Horizon
lexer.hpp
1 #pragma once
2 
3 #include <array> // array
4 #include <clocale> // localeconv
5 #include <cstddef> // size_t
6 #include <cstdio> // snprintf
7 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8 #include <initializer_list> // initializer_list
9 #include <string> // char_traits, string
10 #include <utility> // move
11 #include <vector> // vector
12 
13 #include <nlohmann/detail/input/input_adapters.hpp>
14 #include <nlohmann/detail/input/position_t.hpp>
15 #include <nlohmann/detail/macro_scope.hpp>
16 
17 namespace nlohmann
18 {
19 namespace detail
20 {
22 // lexer //
24 
30 template<typename BasicJsonType>
31 class lexer
32 {
// Shorthand aliases for the number and string types supplied by the
// BasicJsonType template argument; they are the storage types of the value
// members and of the token buffer declared in this class.
33  using number_integer_t = typename BasicJsonType::number_integer_t;
34  using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
35  using number_float_t = typename BasicJsonType::number_float_t;
36  using string_t = typename BasicJsonType::string_t;
37 
38  public:
/// token types for the parser
enum class token_type
{
    uninitialized,    ///< indicating the scanner is uninitialized
    literal_true,     ///< the `true` literal
    literal_false,    ///< the `false` literal
    literal_null,     ///< the `null` literal
    value_string,     ///< a string -- use get_string() for actual value
    value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
    value_integer,    ///< a signed integer -- use get_number_integer() for actual value
    value_float,      ///< a floating-point number -- use get_number_float() for actual value
    begin_array,      ///< the character for array begin `[`
    begin_object,     ///< the character for object begin `{`
    end_array,        ///< the character for array end `]`
    end_object,       ///< the character for object end `}`
    name_separator,   ///< the name separator `:`
    value_separator,  ///< the value separator `,`
    parse_error,      ///< indicating a parse error
    end_of_input,     ///< indicating the end of the input buffer
    literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
};
60 
62  JSON_HEDLEY_RETURNS_NON_NULL
63  JSON_HEDLEY_CONST
64  static const char* token_type_name(const token_type t) noexcept
65  {
66  switch (t)
67  {
69  return "<uninitialized>";
71  return "true literal";
73  return "false literal";
75  return "null literal";
77  return "string literal";
81  return "number literal";
83  return "'['";
85  return "'{'";
87  return "']'";
89  return "'}'";
91  return "':'";
93  return "','";
95  return "<parse error>";
97  return "end of input";
99  return "'[', '{', or a literal";
100  // LCOV_EXCL_START
101  default: // catch non-enum values
102  return "unknown token";
103  // LCOV_EXCL_STOP
104  }
105  }
106 
107  explicit lexer(detail::input_adapter_t&& adapter)
108  : ia(std::move(adapter)), decimal_point_char(get_decimal_point()) {}
109 
110  // delete because of pointer members
111  lexer(const lexer&) = delete;
112  lexer(lexer&&) = delete;
113  lexer& operator=(lexer&) = delete;
114  lexer& operator=(lexer&&) = delete;
115  ~lexer() = default;
116 
117  private:
119  // locales
121 
123  JSON_HEDLEY_PURE
124  static char get_decimal_point() noexcept
125  {
126  const auto loc = localeconv();
127  assert(loc != nullptr);
128  return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
129  }
130 
132  // scan functions
134 
150  int get_codepoint()
151  {
152  // this function only makes sense after reading `\u`
153  assert(current == 'u');
154  int codepoint = 0;
155 
156  const auto factors = { 12u, 8u, 4u, 0u };
157  for (const auto factor : factors)
158  {
159  get();
160 
161  if (current >= '0' and current <= '9')
162  {
163  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
164  }
165  else if (current >= 'A' and current <= 'F')
166  {
167  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
168  }
169  else if (current >= 'a' and current <= 'f')
170  {
171  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
172  }
173  else
174  {
175  return -1;
176  }
177  }
178 
179  assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
180  return codepoint;
181  }
182 
198  bool next_byte_in_range(std::initializer_list<int> ranges)
199  {
200  assert(ranges.size() == 2 or ranges.size() == 4 or ranges.size() == 6);
201  add(current);
202 
203  for (auto range = ranges.begin(); range != ranges.end(); ++range)
204  {
205  get();
206  if (JSON_HEDLEY_LIKELY(*range <= current and current <= *(++range)))
207  {
208  add(current);
209  }
210  else
211  {
212  error_message = "invalid string: ill-formed UTF-8 byte";
213  return false;
214  }
215  }
216 
217  return true;
218  }
219 
235  token_type scan_string()
236  {
237  // reset token_buffer (ignore opening quote)
238  reset();
239 
240  // we entered the function by reading an open quote
241  assert(current == '\"');
242 
243  while (true)
244  {
245  // get next character
246  switch (get())
247  {
248  // end of file while parsing string
249  case std::char_traits<char>::eof():
250  {
251  error_message = "invalid string: missing closing quote";
253  }
254 
255  // closing quote
256  case '\"':
257  {
258  return token_type::value_string;
259  }
260 
261  // escapes
262  case '\\':
263  {
264  switch (get())
265  {
266  // quotation mark
267  case '\"':
268  add('\"');
269  break;
270  // reverse solidus
271  case '\\':
272  add('\\');
273  break;
274  // solidus
275  case '/':
276  add('/');
277  break;
278  // backspace
279  case 'b':
280  add('\b');
281  break;
282  // form feed
283  case 'f':
284  add('\f');
285  break;
286  // line feed
287  case 'n':
288  add('\n');
289  break;
290  // carriage return
291  case 'r':
292  add('\r');
293  break;
294  // tab
295  case 't':
296  add('\t');
297  break;
298 
299  // unicode escapes
300  case 'u':
301  {
302  const int codepoint1 = get_codepoint();
303  int codepoint = codepoint1; // start with codepoint1
304 
305  if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
306  {
307  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
308  return token_type::parse_error;
309  }
310 
311  // check if code point is a high surrogate
312  if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF)
313  {
314  // expect next \uxxxx entry
315  if (JSON_HEDLEY_LIKELY(get() == '\\' and get() == 'u'))
316  {
317  const int codepoint2 = get_codepoint();
318 
319  if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
320  {
321  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
322  return token_type::parse_error;
323  }
324 
325  // check if codepoint2 is a low surrogate
326  if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF))
327  {
328  // overwrite codepoint
329  codepoint = static_cast<int>(
330  // high surrogate occupies the most significant 22 bits
331  (static_cast<unsigned int>(codepoint1) << 10u)
332  // low surrogate occupies the least significant 15 bits
333  + static_cast<unsigned int>(codepoint2)
334  // there is still the 0xD800, 0xDC00 and 0x10000 noise
335  // in the result so we have to subtract with:
336  // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
337  - 0x35FDC00u);
338  }
339  else
340  {
341  error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
342  return token_type::parse_error;
343  }
344  }
345  else
346  {
347  error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
348  return token_type::parse_error;
349  }
350  }
351  else
352  {
353  if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF))
354  {
355  error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
356  return token_type::parse_error;
357  }
358  }
359 
360  // result of the above calculation yields a proper codepoint
361  assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
362 
363  // translate codepoint into bytes
364  if (codepoint < 0x80)
365  {
366  // 1-byte characters: 0xxxxxxx (ASCII)
367  add(codepoint);
368  }
369  else if (codepoint <= 0x7FF)
370  {
371  // 2-byte characters: 110xxxxx 10xxxxxx
372  add(static_cast<int>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
373  add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
374  }
375  else if (codepoint <= 0xFFFF)
376  {
377  // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
378  add(static_cast<int>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
379  add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
380  add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
381  }
382  else
383  {
384  // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
385  add(static_cast<int>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
386  add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
387  add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
388  add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
389  }
390 
391  break;
392  }
393 
394  // other characters after escape
395  default:
396  error_message = "invalid string: forbidden character after backslash";
397  return token_type::parse_error;
398  }
399 
400  break;
401  }
402 
403  // invalid control characters
404  case 0x00:
405  {
406  error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
407  return token_type::parse_error;
408  }
409 
410  case 0x01:
411  {
412  error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
413  return token_type::parse_error;
414  }
415 
416  case 0x02:
417  {
418  error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
419  return token_type::parse_error;
420  }
421 
422  case 0x03:
423  {
424  error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
425  return token_type::parse_error;
426  }
427 
428  case 0x04:
429  {
430  error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
431  return token_type::parse_error;
432  }
433 
434  case 0x05:
435  {
436  error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
437  return token_type::parse_error;
438  }
439 
440  case 0x06:
441  {
442  error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
443  return token_type::parse_error;
444  }
445 
446  case 0x07:
447  {
448  error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
449  return token_type::parse_error;
450  }
451 
452  case 0x08:
453  {
454  error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
455  return token_type::parse_error;
456  }
457 
458  case 0x09:
459  {
460  error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
461  return token_type::parse_error;
462  }
463 
464  case 0x0A:
465  {
466  error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
467  return token_type::parse_error;
468  }
469 
470  case 0x0B:
471  {
472  error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
473  return token_type::parse_error;
474  }
475 
476  case 0x0C:
477  {
478  error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
479  return token_type::parse_error;
480  }
481 
482  case 0x0D:
483  {
484  error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
485  return token_type::parse_error;
486  }
487 
488  case 0x0E:
489  {
490  error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
491  return token_type::parse_error;
492  }
493 
494  case 0x0F:
495  {
496  error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
497  return token_type::parse_error;
498  }
499 
500  case 0x10:
501  {
502  error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
503  return token_type::parse_error;
504  }
505 
506  case 0x11:
507  {
508  error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
509  return token_type::parse_error;
510  }
511 
512  case 0x12:
513  {
514  error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
515  return token_type::parse_error;
516  }
517 
518  case 0x13:
519  {
520  error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
521  return token_type::parse_error;
522  }
523 
524  case 0x14:
525  {
526  error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
527  return token_type::parse_error;
528  }
529 
530  case 0x15:
531  {
532  error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
533  return token_type::parse_error;
534  }
535 
536  case 0x16:
537  {
538  error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
539  return token_type::parse_error;
540  }
541 
542  case 0x17:
543  {
544  error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
545  return token_type::parse_error;
546  }
547 
548  case 0x18:
549  {
550  error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
551  return token_type::parse_error;
552  }
553 
554  case 0x19:
555  {
556  error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
557  return token_type::parse_error;
558  }
559 
560  case 0x1A:
561  {
562  error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
563  return token_type::parse_error;
564  }
565 
566  case 0x1B:
567  {
568  error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
569  return token_type::parse_error;
570  }
571 
572  case 0x1C:
573  {
574  error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
575  return token_type::parse_error;
576  }
577 
578  case 0x1D:
579  {
580  error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
581  return token_type::parse_error;
582  }
583 
584  case 0x1E:
585  {
586  error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
587  return token_type::parse_error;
588  }
589 
590  case 0x1F:
591  {
592  error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
593  return token_type::parse_error;
594  }
595 
596  // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
597  case 0x20:
598  case 0x21:
599  case 0x23:
600  case 0x24:
601  case 0x25:
602  case 0x26:
603  case 0x27:
604  case 0x28:
605  case 0x29:
606  case 0x2A:
607  case 0x2B:
608  case 0x2C:
609  case 0x2D:
610  case 0x2E:
611  case 0x2F:
612  case 0x30:
613  case 0x31:
614  case 0x32:
615  case 0x33:
616  case 0x34:
617  case 0x35:
618  case 0x36:
619  case 0x37:
620  case 0x38:
621  case 0x39:
622  case 0x3A:
623  case 0x3B:
624  case 0x3C:
625  case 0x3D:
626  case 0x3E:
627  case 0x3F:
628  case 0x40:
629  case 0x41:
630  case 0x42:
631  case 0x43:
632  case 0x44:
633  case 0x45:
634  case 0x46:
635  case 0x47:
636  case 0x48:
637  case 0x49:
638  case 0x4A:
639  case 0x4B:
640  case 0x4C:
641  case 0x4D:
642  case 0x4E:
643  case 0x4F:
644  case 0x50:
645  case 0x51:
646  case 0x52:
647  case 0x53:
648  case 0x54:
649  case 0x55:
650  case 0x56:
651  case 0x57:
652  case 0x58:
653  case 0x59:
654  case 0x5A:
655  case 0x5B:
656  case 0x5D:
657  case 0x5E:
658  case 0x5F:
659  case 0x60:
660  case 0x61:
661  case 0x62:
662  case 0x63:
663  case 0x64:
664  case 0x65:
665  case 0x66:
666  case 0x67:
667  case 0x68:
668  case 0x69:
669  case 0x6A:
670  case 0x6B:
671  case 0x6C:
672  case 0x6D:
673  case 0x6E:
674  case 0x6F:
675  case 0x70:
676  case 0x71:
677  case 0x72:
678  case 0x73:
679  case 0x74:
680  case 0x75:
681  case 0x76:
682  case 0x77:
683  case 0x78:
684  case 0x79:
685  case 0x7A:
686  case 0x7B:
687  case 0x7C:
688  case 0x7D:
689  case 0x7E:
690  case 0x7F:
691  {
692  add(current);
693  break;
694  }
695 
696  // U+0080..U+07FF: bytes C2..DF 80..BF
697  case 0xC2:
698  case 0xC3:
699  case 0xC4:
700  case 0xC5:
701  case 0xC6:
702  case 0xC7:
703  case 0xC8:
704  case 0xC9:
705  case 0xCA:
706  case 0xCB:
707  case 0xCC:
708  case 0xCD:
709  case 0xCE:
710  case 0xCF:
711  case 0xD0:
712  case 0xD1:
713  case 0xD2:
714  case 0xD3:
715  case 0xD4:
716  case 0xD5:
717  case 0xD6:
718  case 0xD7:
719  case 0xD8:
720  case 0xD9:
721  case 0xDA:
722  case 0xDB:
723  case 0xDC:
724  case 0xDD:
725  case 0xDE:
726  case 0xDF:
727  {
728  if (JSON_HEDLEY_UNLIKELY(not next_byte_in_range({0x80, 0xBF})))
729  {
730  return token_type::parse_error;
731  }
732  break;
733  }
734 
735  // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
736  case 0xE0:
737  {
738  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
739  {
740  return token_type::parse_error;
741  }
742  break;
743  }
744 
745  // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
746  // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
747  case 0xE1:
748  case 0xE2:
749  case 0xE3:
750  case 0xE4:
751  case 0xE5:
752  case 0xE6:
753  case 0xE7:
754  case 0xE8:
755  case 0xE9:
756  case 0xEA:
757  case 0xEB:
758  case 0xEC:
759  case 0xEE:
760  case 0xEF:
761  {
762  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
763  {
764  return token_type::parse_error;
765  }
766  break;
767  }
768 
769  // U+D000..U+D7FF: bytes ED 80..9F 80..BF
770  case 0xED:
771  {
772  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
773  {
774  return token_type::parse_error;
775  }
776  break;
777  }
778 
779  // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
780  case 0xF0:
781  {
782  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
783  {
784  return token_type::parse_error;
785  }
786  break;
787  }
788 
789  // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
790  case 0xF1:
791  case 0xF2:
792  case 0xF3:
793  {
794  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
795  {
796  return token_type::parse_error;
797  }
798  break;
799  }
800 
801  // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
802  case 0xF4:
803  {
804  if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
805  {
806  return token_type::parse_error;
807  }
808  break;
809  }
810 
811  // remaining bytes (80..C1 and F5..FF) are ill-formed
812  default:
813  {
814  error_message = "invalid string: ill-formed UTF-8 byte";
815  return token_type::parse_error;
816  }
817  }
818  }
819  }
820 
821  JSON_HEDLEY_NON_NULL(2)
822  static void strtof(float& f, const char* str, char** endptr) noexcept
823  {
824  f = std::strtof(str, endptr);
825  }
826 
827  JSON_HEDLEY_NON_NULL(2)
828  static void strtof(double& f, const char* str, char** endptr) noexcept
829  {
830  f = std::strtod(str, endptr);
831  }
832 
833  JSON_HEDLEY_NON_NULL(2)
834  static void strtof(long double& f, const char* str, char** endptr) noexcept
835  {
836  f = std::strtold(str, endptr);
837  }
838 
879  token_type scan_number() // lgtm [cpp/use-of-goto]
880  {
881  // reset token_buffer to store the number's bytes
882  reset();
883 
884  // the type of the parsed number; initially set to unsigned; will be
885  // changed if minus sign, decimal point or exponent is read
886  token_type number_type = token_type::value_unsigned;
887 
888  // state (init): we just found out we need to scan a number
889  switch (current)
890  {
891  case '-':
892  {
893  add(current);
894  goto scan_number_minus;
895  }
896 
897  case '0':
898  {
899  add(current);
900  goto scan_number_zero;
901  }
902 
903  case '1':
904  case '2':
905  case '3':
906  case '4':
907  case '5':
908  case '6':
909  case '7':
910  case '8':
911  case '9':
912  {
913  add(current);
914  goto scan_number_any1;
915  }
916 
917  // all other characters are rejected outside scan_number()
918  default: // LCOV_EXCL_LINE
919  assert(false); // LCOV_EXCL_LINE
920  }
921 
922 scan_number_minus:
923  // state: we just parsed a leading minus sign
924  number_type = token_type::value_integer;
925  switch (get())
926  {
927  case '0':
928  {
929  add(current);
930  goto scan_number_zero;
931  }
932 
933  case '1':
934  case '2':
935  case '3':
936  case '4':
937  case '5':
938  case '6':
939  case '7':
940  case '8':
941  case '9':
942  {
943  add(current);
944  goto scan_number_any1;
945  }
946 
947  default:
948  {
949  error_message = "invalid number; expected digit after '-'";
950  return token_type::parse_error;
951  }
952  }
953 
954 scan_number_zero:
955  // state: we just parse a zero (maybe with a leading minus sign)
956  switch (get())
957  {
958  case '.':
959  {
960  add(decimal_point_char);
961  goto scan_number_decimal1;
962  }
963 
964  case 'e':
965  case 'E':
966  {
967  add(current);
968  goto scan_number_exponent;
969  }
970 
971  default:
972  goto scan_number_done;
973  }
974 
975 scan_number_any1:
976  // state: we just parsed a number 0-9 (maybe with a leading minus sign)
977  switch (get())
978  {
979  case '0':
980  case '1':
981  case '2':
982  case '3':
983  case '4':
984  case '5':
985  case '6':
986  case '7':
987  case '8':
988  case '9':
989  {
990  add(current);
991  goto scan_number_any1;
992  }
993 
994  case '.':
995  {
996  add(decimal_point_char);
997  goto scan_number_decimal1;
998  }
999 
1000  case 'e':
1001  case 'E':
1002  {
1003  add(current);
1004  goto scan_number_exponent;
1005  }
1006 
1007  default:
1008  goto scan_number_done;
1009  }
1010 
1011 scan_number_decimal1:
1012  // state: we just parsed a decimal point
1013  number_type = token_type::value_float;
1014  switch (get())
1015  {
1016  case '0':
1017  case '1':
1018  case '2':
1019  case '3':
1020  case '4':
1021  case '5':
1022  case '6':
1023  case '7':
1024  case '8':
1025  case '9':
1026  {
1027  add(current);
1028  goto scan_number_decimal2;
1029  }
1030 
1031  default:
1032  {
1033  error_message = "invalid number; expected digit after '.'";
1034  return token_type::parse_error;
1035  }
1036  }
1037 
1038 scan_number_decimal2:
1039  // we just parsed at least one number after a decimal point
1040  switch (get())
1041  {
1042  case '0':
1043  case '1':
1044  case '2':
1045  case '3':
1046  case '4':
1047  case '5':
1048  case '6':
1049  case '7':
1050  case '8':
1051  case '9':
1052  {
1053  add(current);
1054  goto scan_number_decimal2;
1055  }
1056 
1057  case 'e':
1058  case 'E':
1059  {
1060  add(current);
1061  goto scan_number_exponent;
1062  }
1063 
1064  default:
1065  goto scan_number_done;
1066  }
1067 
1068 scan_number_exponent:
1069  // we just parsed an exponent
1070  number_type = token_type::value_float;
1071  switch (get())
1072  {
1073  case '+':
1074  case '-':
1075  {
1076  add(current);
1077  goto scan_number_sign;
1078  }
1079 
1080  case '0':
1081  case '1':
1082  case '2':
1083  case '3':
1084  case '4':
1085  case '5':
1086  case '6':
1087  case '7':
1088  case '8':
1089  case '9':
1090  {
1091  add(current);
1092  goto scan_number_any2;
1093  }
1094 
1095  default:
1096  {
1097  error_message =
1098  "invalid number; expected '+', '-', or digit after exponent";
1099  return token_type::parse_error;
1100  }
1101  }
1102 
1103 scan_number_sign:
1104  // we just parsed an exponent sign
1105  switch (get())
1106  {
1107  case '0':
1108  case '1':
1109  case '2':
1110  case '3':
1111  case '4':
1112  case '5':
1113  case '6':
1114  case '7':
1115  case '8':
1116  case '9':
1117  {
1118  add(current);
1119  goto scan_number_any2;
1120  }
1121 
1122  default:
1123  {
1124  error_message = "invalid number; expected digit after exponent sign";
1125  return token_type::parse_error;
1126  }
1127  }
1128 
1129 scan_number_any2:
1130  // we just parsed a number after the exponent or exponent sign
1131  switch (get())
1132  {
1133  case '0':
1134  case '1':
1135  case '2':
1136  case '3':
1137  case '4':
1138  case '5':
1139  case '6':
1140  case '7':
1141  case '8':
1142  case '9':
1143  {
1144  add(current);
1145  goto scan_number_any2;
1146  }
1147 
1148  default:
1149  goto scan_number_done;
1150  }
1151 
1152 scan_number_done:
1153  // unget the character after the number (we only read it to know that
1154  // we are done scanning a number)
1155  unget();
1156 
1157  char* endptr = nullptr;
1158  errno = 0;
1159 
1160  // try to parse integers first and fall back to floats
1161  if (number_type == token_type::value_unsigned)
1162  {
1163  const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1164 
1165  // we checked the number format before
1166  assert(endptr == token_buffer.data() + token_buffer.size());
1167 
1168  if (errno == 0)
1169  {
1170  value_unsigned = static_cast<number_unsigned_t>(x);
1171  if (value_unsigned == x)
1172  {
1173  return token_type::value_unsigned;
1174  }
1175  }
1176  }
1177  else if (number_type == token_type::value_integer)
1178  {
1179  const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1180 
1181  // we checked the number format before
1182  assert(endptr == token_buffer.data() + token_buffer.size());
1183 
1184  if (errno == 0)
1185  {
1186  value_integer = static_cast<number_integer_t>(x);
1187  if (value_integer == x)
1188  {
1189  return token_type::value_integer;
1190  }
1191  }
1192  }
1193 
1194  // this code is reached if we parse a floating-point number or if an
1195  // integer conversion above failed
1196  strtof(value_float, token_buffer.data(), &endptr);
1197 
1198  // we checked the number format before
1199  assert(endptr == token_buffer.data() + token_buffer.size());
1200 
1201  return token_type::value_float;
1202  }
1203 
1209  JSON_HEDLEY_NON_NULL(2)
1210  token_type scan_literal(const char* literal_text, const std::size_t length,
1211  token_type return_type)
1212  {
1213  assert(current == literal_text[0]);
1214  for (std::size_t i = 1; i < length; ++i)
1215  {
1216  if (JSON_HEDLEY_UNLIKELY(get() != literal_text[i]))
1217  {
1218  error_message = "invalid literal";
1219  return token_type::parse_error;
1220  }
1221  }
1222  return return_type;
1223  }
1224 
1226  // input management
1228 
1230  void reset() noexcept
1231  {
1232  token_buffer.clear();
1233  token_string.clear();
1234  token_string.push_back(std::char_traits<char>::to_char_type(current));
1235  }
1236 
1237  /*
1238  @brief get next character from the input
1239 
1240  This function provides the interface to the used input adapter. It does
1241  not throw in case the input reached EOF, but returns a
1242  `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1243  for use in error messages.
1244 
1245  @return character read from the input
1246  */
1247  std::char_traits<char>::int_type get()
1248  {
1249  ++position.chars_read_total;
1250  ++position.chars_read_current_line;
1251 
1252  if (next_unget)
1253  {
1254  // just reset the next_unget variable and work with current
1255  next_unget = false;
1256  }
1257  else
1258  {
1259  current = ia->get_character();
1260  }
1261 
1262  if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1263  {
1264  token_string.push_back(std::char_traits<char>::to_char_type(current));
1265  }
1266 
1267  if (current == '\n')
1268  {
1269  ++position.lines_read;
1270  position.chars_read_current_line = 0;
1271  }
1272 
1273  return current;
1274  }
1275 
1284  void unget()
1285  {
1286  next_unget = true;
1287 
1288  --position.chars_read_total;
1289 
1290  // in case we "unget" a newline, we have to also decrement the lines_read
1291  if (position.chars_read_current_line == 0)
1292  {
1293  if (position.lines_read > 0)
1294  {
1295  --position.lines_read;
1296  }
1297  }
1298  else
1299  {
1300  --position.chars_read_current_line;
1301  }
1302 
1303  if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1304  {
1305  assert(not token_string.empty());
1306  token_string.pop_back();
1307  }
1308  }
1309 
1311  void add(int c)
1312  {
1313  token_buffer.push_back(std::char_traits<char>::to_char_type(c));
1314  }
1315 
1316  public:
1318  // value getters
1320 
1322  constexpr number_integer_t get_number_integer() const noexcept
1323  {
1324  return value_integer;
1325  }
1326 
1328  constexpr number_unsigned_t get_number_unsigned() const noexcept
1329  {
1330  return value_unsigned;
1331  }
1332 
1334  constexpr number_float_t get_number_float() const noexcept
1335  {
1336  return value_float;
1337  }
1338 
1340  string_t& get_string()
1341  {
1342  return token_buffer;
1343  }
1344 
1346  // diagnostics
1348 
1350  constexpr position_t get_position() const noexcept
1351  {
1352  return position;
1353  }
1354 
1358  std::string get_token_string() const
1359  {
1360  // escape control characters
1361  std::string result;
1362  for (const auto c : token_string)
1363  {
1364  if ('\x00' <= c and c <= '\x1F')
1365  {
1366  // escape control characters
1367  std::array<char, 9> cs{{}};
1368  (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c));
1369  result += cs.data();
1370  }
1371  else
1372  {
1373  // add character as is
1374  result.push_back(c);
1375  }
1376  }
1377 
1378  return result;
1379  }
1380 
1382  JSON_HEDLEY_RETURNS_NON_NULL
1383  constexpr const char* get_error_message() const noexcept
1384  {
1385  return error_message;
1386  }
1387 
1389  // actual scanner
1391 
1396  bool skip_bom()
1397  {
1398  if (get() == 0xEF)
1399  {
1400  // check if we completely parse the BOM
1401  return get() == 0xBB and get() == 0xBF;
1402  }
1403 
1404  // the first character is not the beginning of the BOM; unget it to
1405  // process is later
1406  unget();
1407  return true;
1408  }
1409 
1410  token_type scan()
1411  {
1412  // initially, skip the BOM
1413  if (position.chars_read_total == 0 and not skip_bom())
1414  {
1415  error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1416  return token_type::parse_error;
1417  }
1418 
1419  // read next character and ignore whitespace
1420  do
1421  {
1422  get();
1423  }
1424  while (current == ' ' or current == '\t' or current == '\n' or current == '\r');
1425 
1426  switch (current)
1427  {
1428  // structural characters
1429  case '[':
1430  return token_type::begin_array;
1431  case ']':
1432  return token_type::end_array;
1433  case '{':
1434  return token_type::begin_object;
1435  case '}':
1436  return token_type::end_object;
1437  case ':':
1438  return token_type::name_separator;
1439  case ',':
1440  return token_type::value_separator;
1441 
1442  // literals
1443  case 't':
1444  return scan_literal("true", 4, token_type::literal_true);
1445  case 'f':
1446  return scan_literal("false", 5, token_type::literal_false);
1447  case 'n':
1448  return scan_literal("null", 4, token_type::literal_null);
1449 
1450  // string
1451  case '\"':
1452  return scan_string();
1453 
1454  // number
1455  case '-':
1456  case '0':
1457  case '1':
1458  case '2':
1459  case '3':
1460  case '4':
1461  case '5':
1462  case '6':
1463  case '7':
1464  case '8':
1465  case '9':
1466  return scan_number();
1467 
1468  // end of input (the null byte is needed when parsing from
1469  // string literals)
1470  case '\0':
1471  case std::char_traits<char>::eof():
1472  return token_type::end_of_input;
1473 
1474  // error
1475  default:
1476  error_message = "invalid literal";
1477  return token_type::parse_error;
1478  }
1479  }
1480 
1481  private:
// input adapter; get() fetches every character through ia->get_character()
1483  detail::input_adapter_t ia = nullptr;
1484 
// the character most recently delivered by get(); EOF before anything is read
1486  std::char_traits<char>::int_type current = std::char_traits<char>::eof();
1487 
// set by unget(): the next get() delivers `current` again instead of reading
1489  bool next_unget = false;
1490 
// counters of characters and lines read so far, maintained by get()/unget()
1492  position_t position {};
1493 
// raw characters of the current token (used by get_token_string() for errors)
1495  std::vector<char> token_string {};
1496 
// decoded contents of the current string token or characters of the current number
1498  string_t token_buffer {};
1499 
// description of the most recent scan error; always points to a string literal
1501  const char* error_message = "";
1502 
// number values; scan_number() stores its result in the member matching the returned token type
1503  // number values
1504  number_integer_t value_integer = 0;
1505  number_unsigned_t value_unsigned = 0;
1506  number_float_t value_float = 0;
1507 
// decimal point character of the locale, captured by the constructor via get_decimal_point()
1509  const char decimal_point_char = '.';
1510 };
1511 } // namespace detail
1512 } // namespace nlohmann
lexical analysis
Definition: lexer.hpp:32
token_type
token types for the parser
Definition: lexer.hpp:41
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ end_of_input
indicating the end of the input buffer
@ name_separator
the name separator :
@ literal_or_value
a literal or the begin of a value (only for diagnostics)
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.hpp:1358
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.hpp:1340
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.hpp:1328
bool skip_bom()
skip the UTF-8 byte order mark
Definition: lexer.hpp:1396
constexpr JSON_HEDLEY_RETURNS_NON_NULL const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.hpp:1383
constexpr position_t get_position() const noexcept
return position of last read token
Definition: lexer.hpp:1350
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.hpp:1334
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return name of values of type token_type (only used for errors)
Definition: lexer.hpp:64
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.hpp:1322
std::shared_ptr< input_adapter_protocol > input_adapter_t
a type to simplify interfaces
Definition: input_adapters.hpp:49
namespace for Niels Lohmann
Definition: adl_serializer.hpp:9
struct to capture the start position of the current token
Definition: position_t.hpp:11