// Written in the D programming language.
/**
Haystack Zinc token lexer

Copyright: Copyright (c) 2017, Radu Racariu <radu.racariu@gmail.com>
License:   $(LINK2 www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
Authors:   Radu Racariu
**/
module haystack.zinc.lexer;
import haystack.tag;
import haystack.util.misc;
import std.ascii : isLower,
                   isUpper,
                   isAlpha,
                   isAlphaNum,
                   isDigit,
                   isControl,
                   isHexDigit,
                   isWhite;

/// Types of tokens that the lexer can provide
enum TokenType
{
    id,
    null_,
    marker,
    remove,
    na,
    bool_,
    ref_,
    str,
    uri,
    number,
    date,
    time,
    dateTime,
    coord,
    xstr,
    empty = uint.max // sentinel: token carries a raw char, not a decoded value
}

/**
The result of a Lexer action.
*/
struct Token
{
    /**
    Create a token of a type and value
    */
    this(TokenType type, Tag tag)
    in (type != TokenType.empty, "Invalid token type")
    {
        this._type = type;
        this.data = tag;
    }

    /**
    Create a token of a non value type
    */
    this(TokenType type)
    in (type > TokenType.id && type < TokenType.bool_, "Invalid token type")
    {
        this._type = type;
    }

    // Create a token that wraps a single raw character (type is TokenType.empty)
    static Token makeChar(dchar c)
    {
        return Token(TokenType.empty, c);
    }

    // Create a char token
    private this(TokenType type, dchar c)
    in (type == TokenType.empty, "Invalid token type")
    {
        this._type = TokenType.empty;
        this._chr = c;
    }

    /**
    Current TokenType
    */
    @property TokenType type() pure const
    {
        return _type;
    }

    /**
    Token's tag data
    */
    @property ref const(Tag) tag() pure const
    {
        return data;
    }

    /**
    Get a Tag value from the token's data
    */
    const(T) value(T)() const
    in (isValid, "Can't get value from empty token.")
    {
        return data.get!T;
    }

    // The raw character carried by an empty (char) token
    @property dchar curChar() pure const
    in (type == TokenType.empty, "Invalid token type")
    {
        return _chr;
    }

    // True when this token holds a decoded value rather than a raw char
    @property bool isValid() pure const
    {
        return type != 
TokenType.empty;
    }

    // True when this token has the given type AND the given tag value.
    bool isOf(TokenType type, Tag value) const
    {
        // fix: compare the token's own type with the parameter
        // (was `type == type` — the parameter compared with itself, always true,
        // so isOf degenerated to a tag-only comparison)
        return this.type == type && tag == value;
    }

    bool isId() pure const
    {
        return type == TokenType.id;
    }

    // True when this is a char token holding exactly `c`
    bool hasChr(dchar c) pure const
    {
        return isEmpty && _chr == c;
    }

    @property bool isEmpty() pure const
    {
        return type == TokenType.empty;
    }

    // NOTE(review): reads _chr without an isEmpty guard; for non-empty tokens
    // _chr is dchar.init — confirm callers only use this on char tokens
    @property bool isSpace() pure const
    {
        return !isNewLine && _chr.isWhite;
    }

    @property bool isNewLine() pure const
    {
        return isEmpty && _chr.isWhite && _chr.isControl;
    }

    @property bool isAlpha() pure const
    {
        return isEmpty && _chr.isAlpha;
    }

    @property bool isAlphaNum() pure const
    {
        return isEmpty && _chr.isAlphaNum;
    }

    @property bool isUpper() pure const
    {
        return isEmpty && _chr.isUpper;
    }

    // True for any scalar value token (null_ .. xstr)
    bool isScalar() pure const
    {
        return type >= TokenType.null_
            && type <= TokenType.xstr;
    }

    bool opEquals()(auto ref const(Token) tk) const
    {
        // optimize non-value cases
        if (type == TokenType.empty)
            return tk.type == TokenType.empty;
        if (type == TokenType.null_)
            return tk.type == TokenType.null_;
        if (type == TokenType.marker)
            return tk.type == TokenType.marker;
        if (type == TokenType.remove)
            return tk.type == TokenType.remove;
        if (type == TokenType.na)
            return tk.type == TokenType.na;

        return type == tk.type && data == tk.data;
    }

private:
    TokenType _type = TokenType.empty;
    Tag data;
    dchar _chr;
}

/**
Lexes Zinc tokens from some char $(D InputRange)
*/
struct ZincLexer(Range)
if (isCharInputRange!Range)
{
    this(Range r, int ver = 3)
    {
        this.input = LookAhead!Range(r);
        this.ver = ver;
        if (r.empty)
            isEmpty = true;
        else
            popFront();
    }

    @property bool empty() pure nothrow
    {
        return isEmpty;
    }

    @property ref const(Token) 
front() pure nothrow
    {
        return crtToken;
    }

    // The current raw input character
    @property char cur()
    {
        return input.front;
    }

    // Decode the next token by trying each lexer in sequence; each lexNNN
    // either commits a token (break loop) or restores the look-ahead stash
    // so the next candidate can retry the same chars.
    void popFront()
    {
        if (input.hasStash)
            input.clearStash();

        if (input.empty)
        {
            isEmpty = true;
            return;
        }

        TokenType nextToken;
        char startChr = cur;

    loop:
        while (!input.empty)
        {
            // short circuit special chars
            if (startChr.isWhite || startChr.isControl)
            {
                if (startChr == '\r') // normalize CR-LF
                {
                    input.popFront();
                    startChr = cur;
                    continue loop;
                }
                crtToken = Token.makeChar(startChr);
                return input.popFront();
            }

            switch (nextToken)
            {
                case TokenType.id:
                    if (lexId())
                        break loop;
                    nextToken = TokenType.null_;
                    continue loop;

                case TokenType.null_:
                    if (lexNull())
                        break loop;
                    nextToken = TokenType.marker;
                    continue loop;

                case TokenType.marker:
                    if (lexMarker())
                        break loop;
                    nextToken = TokenType.remove;
                    continue loop;

                case TokenType.remove:
                    if (lexRemove())
                        break loop;
                    nextToken = TokenType.na;
                    continue loop;

                case TokenType.na:
                    if (lexNa())
                        break loop;
                    nextToken = TokenType.bool_;
                    continue loop;

                case TokenType.bool_:
                    if (lexBool())
                        break loop;
                    nextToken = TokenType.ref_;
                    continue loop;

                case TokenType.ref_:
                    if (lexRef())
                        break loop;
                    nextToken = TokenType.str;
                    continue loop;

                case TokenType.str:
                    if (lexStr())
                        break loop;
                    nextToken = TokenType.uri;
                    continue loop;

                case TokenType.uri:
                    if (lexUri())
                        break loop;
                    nextToken = TokenType.number;
                    continue loop;

                case TokenType.number:
                    if (lexNumber())
                    {
                        if (input.crtStash.length <= 4) // verify if this isn't a date
                        {
                            if (lexDateTime() || lexTime())
                                break loop;
                        }
                        break loop;
                    }
                    else
                    {
                        nextToken = TokenType.dateTime;
                        continue loop;
                    }

                case TokenType.dateTime: // the date part can be parsed here, so try both
                    if (lexDateTime())
                        break loop;
                    nextToken = TokenType.time;
                    continue loop;

                case TokenType.time:
                    if (lexTime())
                        break loop;
                    nextToken = TokenType.coord;
                    continue loop;

                case TokenType.coord:
                    if (lexCoord())
                        break loop;
                    nextToken = TokenType.xstr;
                    continue loop;

                case TokenType.xstr:
                    // legacy Bin tag only exists before Zinc version 3
                    if (ver < 3 && lexBin())
                        break loop;
                    else if (lexXStr())
                        break loop;
                    goto default;

                default:
                    // no lexer matched: emit the raw char token
                    crtToken = Token.makeChar(startChr);
                    if (input.hasStash)
                        return input.save();
                    if (!input.empty)
                        input.popFront();
                    break loop;
            }
        }
    }

    @property ref Range range() scope return
    {
        return input.range;
    }

    @property void range(scope ref Range r)
    {
        input.range = r;
        crtToken = Token.makeChar(r.front);
    }

    // zinc spec version
    int ver = 3;

    // internals
    package(haystack):

    @disable this();
    @disable this(this);

    @property ref buffer() scope return
    {
        return input;
    }

    bool isEmpty = false;

    // Lexes an identifier: lower-case first char, then [A-Za-z0-9_]*
    bool lexId()
    {
        enum State { fistChar, restChars }
    loop:
        for (State crtState; !input.empty; input.popFront())
        {
            final switch (crtState)
            {
                case State.fistChar: // required to start with lower case alpha
                    if (!cur.isLower)
                        return false;
                    input.stash();
                    crtState++;
                    continue;

                case State.restChars:
                    if (isXStrChar)
                        input.stash();
                    else
                        break loop;
            }
        }
        crtToken = Token(TokenType.id, input.commitStash().tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("abcAbcD123_wwwe", Token(TokenType.id, "abcAbcD123_wwwe".tag));
        assertTokenValue("idFoo@", Token(TokenType.id, "idFoo".tag));
        assertTokenValue("idBar ", Token(TokenType.id, "idBar".tag));
        assertTokenValue("someId,", Token(TokenType.id, 
"someId".tag));
        // bad
        assertTokenEmpty("BAD%Id");
    }

    // Lexes the null scalar 'N'; backtracks when 'N' starts a longer word
    bool lexNull()
    {
        if (cur != 'N')
            return false;

        // probe if this has more
        input.stash();
        input.popFront();
        if (!input.empty && isXStrChar)
        {
            input.save(); // save look ahead
            return false;
        }

        crtToken = Token(TokenType.null_, Tag());
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("N", Token(TokenType.null_));
        assertTokenValue("N ", Token(TokenType.null_));
        assertTokenValue("N,", Token(TokenType.null_));
        // bad
        assertTokenEmpty("X");
        assertTokenEmpty("Nx");
    }

    // Lexes the marker scalar 'M'; backtracks when 'M' starts a longer word
    bool lexMarker()
    {
        if (cur != 'M')
            return false;

        input.stash();
        input.popFront();
        if (!input.empty && isXStrChar)
        {
            input.save;
            return false;
        }

        crtToken = Token(TokenType.marker, marker());
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("M", Token(TokenType.marker));
        assertTokenValue("M ", Token(TokenType.marker));
        assertTokenValue("M|", Token(TokenType.marker));
        // bad
        assertTokenEmpty("Y");
    }

    // Lexes the remove scalar 'R'; backtracks when 'R' starts a longer word
    bool lexRemove()
    {
        if (cur != 'R')
            return false;

        input.stash();
        input.popFront();
        if (!input.empty && isXStrChar)
        {
            input.save;
            return false;
        }

        crtToken = Token(TokenType.remove, Tag.init);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("R", Token(TokenType.remove));
        assertTokenValue("R ", Token(TokenType.remove));
        assertTokenValue("R,", Token(TokenType.remove));
        // bad
        assertTokenEmpty("K");
    }

    // Lexes the NA scalar; backtracks when "NA" starts a longer word
    bool lexNa()
    {
        if (cur != 'N')
            return false;

        input.stash();
        input.popFront();
        input.stash();
        if (cur != 'A')
        {
            // more to lex
            input.popFront();
            input.save();
            return false;
        }

        if (!input.empty)
        {
            input.popFront();
            if (!input.empty && isXStrChar)
            {
input.stash();
                input.save;
                return false;
            }
        }
        crtToken = Token(TokenType.na, Na().Tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("NA", Token(TokenType.na));
        assertTokenValue("NA ", Token(TokenType.na));
        assertTokenValue("NA,", Token(TokenType.na));
        // bad
        assertTokenEmpty("NAM,");
        assertTokenEmpty("XY");
    }

    // Lexes a bool scalar 'T' or 'F'; backtracks when it starts a longer word
    bool lexBool()
    {
        if (cur != 'T' && cur != 'F')
            return false;

        const val = (cur == 'T');

        input.stash();
        input.popFront();
        if (!input.empty && isXStrChar)
        {
            input.save;
            return false;
        }

        crtToken = Token(TokenType.bool_, val.tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("T", Token(TokenType.bool_, true.tag));
        assertTokenValue("T\t", Token(TokenType.bool_, true.tag));
        assertTokenValue("F", Token(TokenType.bool_, false.tag));
        assertTokenValue("F,", Token(TokenType.bool_, false.tag));
        // bad
        assertTokenEmpty("K");
    }

    // Lexes a Ref scalar '@id' with an optional display string after a space
    bool lexRef()
    {
        if (cur != '@')
            return false;

        string val;
        string dis;

        for (input.popFront(); !input.empty; input.popFront())
        {
            if (cur.isAlphaNum
                || cur == '_'
                || cur == ':'
                || cur == '-'
                || cur == '.'
|| cur == '~')
            {
                input.stash();
            }
            else if ((cur.isWhite && !cur.isControl) && input.hasStash)
            {
                input.popFront(); // skip ws
                val = input.commitStash();
                if (lexStr()) // optional display string follows the id
                {
                    dis = crtToken.value!Str;
                    crtToken = Token();
                }
                else
                {
                    input.save();
                }
                break;
            }
            else if (input.hasStash)
            {
                break;
            }
            else
            {
                return false;
            }
        }
        if (val is null)
        {
            if (!input.hasStash)
                return false;
            val = input.commitStash();
        }
        crtToken = Token(TokenType.ref_, Tag(Ref(val, dis)));
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("@fooBar,", Token(TokenType.ref_, Ref("fooBar").Tag));
        assertTokenValue(`@fooBar "a nice description"`, Token(TokenType.ref_, Ref("fooBar", "a nice description").Tag));
        assertTokenValue(`@fooBar ,`, Token(TokenType.ref_, Ref("fooBar").Tag));
        // bad
        assertTokenEmpty("@");
        assertTokenEmpty("&");
        assertTokenEmpty("@#");
    }

    // Lexes the chars of a delimited literal, resolving `\x` escapes via the
    // esc -> escVal mapping and `\uXXXX` unicode escapes; returns the decoded
    // chars, "" for an empty literal, or null on error / missing terminator.
    string lexChars(immutable char[] esc, immutable char[] escVal, char delim = '"')
    in (esc.length == escVal.length)
    {
        import std.format : formattedRead;
        import std.string : indexOf;

        if (cur != delim)
            return null;

        bool hasTerm = false;
        for (input.popFront(); !input.empty; input.popFront())
        {
        loop:
            if (cur == delim) // found terminator
            {
                hasTerm = true;
                input.popFront();
                break;
            }

            if (cur < ' ')
                return null;

            if (cur != '\\')
            {
                input.stash();
            }
            else
            {
                if (input.empty)
                    return null;

                input.popFront();
                if (cur == 'u')
                {
                    if (input.empty)
                        return null;
                    input.popFront();
                    if (input.empty || !cur.isHexDigit)
                        return null;
                    dchar unicodeChar;
                    int count = input.formattedRead("%x", &unicodeChar);
                    if (!count)
                        return null;
                    input.stash(unicodeChar);
                    // we consumed all u's chars, no need to popFront
                    goto loop;
}
                ptrdiff_t escPos = esc.indexOf(cur);
                if (escPos != -1)
                    input.stash(escVal[escPos]);
                else
                    return null; // unknown escape
            }
        }
        if (!hasTerm)
            return null;
        if (!input.hasStash)
            return "";
        return input.commitStash();
    }

    // Lexes a double-quoted Str literal with the Zinc escape set
    bool lexStr()
    {
        enum delim = '"';
        static immutable strEsc = [ 'n', 'r', 't', '"', '\\', '$', 'b', 'f'];
        static immutable strEscVal = ['\n', '\r', '\t', '"', '\\', '$', '\b', '\f'];

        string chars = lexChars(strEsc, strEscVal, delim);
        if (chars is null)
            return false;

        crtToken = Token(TokenType.str, chars.tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue(`"hello world"`, Token(TokenType.str, "hello world".tag));
        assertTokenValue(`"a line\nsome\ttab"`, Token(TokenType.str, "a line\nsome\ttab".tag));
        assertTokenValue(`""`, Token(TokenType.str, "".tag));
        assertTokenValue(`"some unicode char: \u00E6"`, Token(TokenType.str, "some unicode char: æ".tag));
        assertTokenValue(`"inline unicode char: 語"`, Token(TokenType.str, "inline unicode char: 語".tag));
        // bad
        assertTokenEmpty(`"`);
        assertTokenEmpty(`"fooo`);
        assertTokenEmpty(`"a bad \u"`);
    }

    // Lexes a back-tick delimited Uri literal (escapes map to themselves)
    bool lexUri()
    {
        enum delim = '`';
        static immutable uriEsc = [':', '/', '?', '#', '[', ']', '@', '`', '\\', '&', '=', ';'];
        static immutable uriEscVal = [':', '/', '?', '#', '[', ']', '@', '`', '\\', '&', '=', ';'];
        string chars = lexChars(uriEsc, uriEscVal, delim);
        if (chars !is null)
        {
            crtToken = Token(TokenType.uri, cast(Tag) Uri(chars));
            return true;
        }
        return false;
    }
    unittest
    {
        // good
        assertTokenValue("`/a/b/c`", Token(TokenType.uri, cast(Tag) Uri("/a/b/c")));
        // bad
        assertTokenEmpty("`");
    }

    // Lexes a Num scalar: optional sign, digits, fraction, exponent, unit,
    // plus the special INF / -INF / NaN forms
    bool lexNumber()
    {
        import std.math : isNaN;
        enum State { integral, fractionalDigit, fractional, expSign, exp, unit }

        // test the optional sign
        if (cur == '-')
        {
input.stash();
            input.popFront();
            if (input.empty)
                return false;
            if (!cur.isDigit && input.find("INF"))
            {
                crtToken = Token(TokenType.number, tag(-1 * double.infinity));
                return true;
            }
        }
        // lex number parts
        if (cur.isDigit)
        {
            static void parseNum(const(char)[] chars, ref double val)
            {
                import std.format : formattedRead;
                chars.formattedRead("%g", &val);
            }

            // unit chars: alpha, %, _, /, $, or any non-ASCII
            bool isUnit()
            {
                return cur.isAlpha || cur == '%' || cur == '_' || cur == '/' || cur == '$' || cur > 127;
            }

            double value;
            string unit;
            State crtState = State.integral;
            input.stash();
        loop:
            for (input.popFront(); !input.empty; input.popFront())
            {
                final switch (crtState)
                {
                    case State.integral:
                        if (cur.isDigit)
                        {
                            input.stash();
                        }
                        else if (cur == '_') // digit group separator, skipped
                        {
                            continue;
                        }
                        else if (cur == '.')
                        {
                            input.stash();
                            crtState = State.fractionalDigit;
                        }
                        else if (isUnit)
                        {
                            parseNum(input.crtStash, value);
                            input.clearStash();
                            input.stash();
                            crtState = State.unit;
                        }
                        else
                        {
                            // save current scratch buffer and try matching for date or time
                            if (input.crtStash.length == 4 && cur == '-'
                                || input.crtStash.length == 2 && cur == ':')
                                input.save();
                            break loop;
                        }
                        break;

                    case State.fractionalDigit:
                        if (cur.isDigit)
                        {
                            input.stash();
                            crtState = State.fractional;
                        }
                        else
                        {
                            return false;
                        }
                        break;

                    case State.fractional:
                        if (cur.isDigit)
                        {
                            input.stash();
                        }
                        else if (cur == '_')
                        {
                            continue;
                        }
                        else if (cur == 'e' || cur == 'E')
                        {
                            input.stash();
                            crtState = State.expSign;
                        }
                        else if (isUnit)
                        {
                            parseNum(input.crtStash, value);
                            input.clearStash();
                            input.stash();
                            crtState = State.unit;
                        }
                        else
                        {
                            // nothing to process
                            break loop;
                        }
                        break;

                    case State.expSign:
if (cur == '+' || cur == '-' || cur.isDigit)
                        {
                            input.stash();
                            crtState = State.exp;
                        }
                        else
                        {
                            return false;
                        }
                        break;

                    case State.exp:
                        if (cur.isDigit)
                        {
                            input.stash();
                        }
                        else if (isUnit)
                        {
                            parseNum(input.crtStash, value);
                            input.clearStash();
                            input.stash();
                            crtState = State.unit;
                        }
                        else
                        {
                            // nothing to process
                            break loop;
                        }
                        break;

                    case State.unit:
                        if (isUnit)
                        {
                            input.stash();
                        }
                        else
                        {
                            // nothing to process
                            break loop;
                        }
                        break;
                }
            }

            // value is only parsed early when a unit was found;
            // otherwise the whole stash is still the number
            if (value.isNaN)
                parseNum(input.crtStash, value);
            else
                unit = input.commitStash();

            crtToken = Token(TokenType.number, Num(value, unit).Tag);
            return true;
        }
        else if (input.find("INF"))
        {
            crtToken = Token(TokenType.number, tag(double.infinity));
            return true;
        }
        else if (input.crtStash == "Na") // "Na" stashed by a previous lexer; expect 'N'
        {
            input.clearStash();
            if (input.empty)
                return false;

            if (cur == 'N')
            {
                input.popFront();
                crtToken = Token(TokenType.number, tag(double.nan));
                return true;
            }
        }
        return false;
    }
    unittest
    {
        assertTokenValue("-INF", Token(TokenType.number, (-1 * double.infinity).tag));
        assertTokenValue("INF", Token(TokenType.number, (double.infinity).tag));
        assertTokenIsNan("NaN");
        assertTokenIsNan("NaN,");
        assertTokenValue("100", Token(TokenType.number, 100.tag));
        assertTokenValue("-88", Token(TokenType.number, (-88).tag));
        assertTokenValue("-99.", Token(TokenType.number, (-99.0).tag));
        assertTokenValue("42.42", Token(TokenType.number, (42.42).tag));
        assertTokenValue("9.6e+10", Token(TokenType.number, (96000000000).tag));
        assertTokenValue("100%", Token(TokenType.number, Num(100, "%").Tag));
        assertTokenValue("100$", Token(TokenType.number, Num(100, "$").Tag));
        // bad
        assertTokenEmpty("-");
        assertTokenEmpty("Na");
assertTokenEmpty("IN");
        assertTokenEmpty("_12");
    }

    // Lexes a Date scalar YYYY-MM-DD; `parts` counts accepted digits (4+2+2)
    bool lexDate()
    {
        import std.conv : to;
        enum State { year, month, day }

        State crtState;
        int year, month, day;
        int parts = 0;

        for (; !input.empty && parts < 8; input.popFront())
        {
            final switch (crtState)
            {
                case State.year:
                    if (cur.isDigit)
                    {
                        input.stash();
                        if (++parts > 4) // to many digits
                            return false;
                    }
                    else if (cur == '-' && parts == 4)
                    {
                        year = to!int(input.crtStash());
                        input.clearStash();
                        crtState++;
                    }
                    else if (parts > 0) // keep the stashed digits
                    {
                        input.save();
                        return false;
                    }
                    else // no match
                    {
                        return false;
                    }
                    break;

                case State.month:
                    if (cur.isDigit)
                    {
                        input.stash();
                        if (++parts > 6)
                            return false;
                    }
                    else if (cur == '-' && parts == 6)
                    {
                        month = to!int(input.crtStash());
                        input.clearStash();
                        crtState++;
                    }
                    else
                    {
                        return false;
                    }
                    break;

                case State.day:
                    if (!cur.isDigit)
                        return false;

                    input.stash();
                    if (++parts == 8)
                    {
                        day = to!int(input.crtStash());
                        input.clearStash();
                    }
                    break;
            }
        }
        if (crtState < State.day)
            return false;
        crtToken = Token(TokenType.date, Date(year, month, day).Tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("2017-01-15", Token(TokenType.date, Date(2017, 01, 15).Tag));
        assertTokenValue("2900-12-31", Token(TokenType.date, Date(2900, 12, 31).Tag));
        assertTokenValue("2900-12-31234", Token(TokenType.date, Date(2900, 12, 31).Tag));
        // bad
        assertTokenValue("200017-111-22", Token(TokenType.number, 200017.tag));
        assertTokenValue("2-1-2", Token(TokenType.number, 2.tag));
        assertTokenValue("2017_1-2", Token(TokenType.number, 20171.tag));
    }

    // Lexes a Time scalar hh:mm:ss[.fff]
    bool lexTime()
    {
        import std.conv : to;
        enum State {
hours, minutes, sec, dot, fraction }

        State crtState;
        int hours, minutes, sec, fraction;
        int parts = 0;

        if (input.empty)
            return false;

    loop:
        for (; !input.empty; input.popFront())
        {
            final switch (crtState)
            {
                case State.hours:
                    if (cur.isDigit) // check the 2nd digit of the hours number
                    {
                        input.stash();
                        if (++parts > 2)
                            return false;
                    }
                    else if (cur == ':' && parts == 2) // got the 2 hour numbers and sep
                    {
                        hours = to!int(input.crtStash());
                        input.clearStash();
                        crtState++;
                    }
                    // no separator found
                    else
                    {
                        return false;
                    }
                    break;

                case State.minutes:
                    if (cur.isDigit)
                    {
                        input.stash();
                        if (++parts > 4)
                            return false;
                    }
                    else if (cur == ':' && parts == 4)
                    {
                        minutes = to!int(input.crtStash());
                        input.clearStash();
                        crtState++;
                    }
                    else
                    {
                        return false; // no separator found
                    }
                    break;

                case State.sec:
                    if (cur.isDigit)
                    {
                        input.stash();
                        parts++;
                    }
                    else
                    {
                        return false;
                    }

                    if (parts == 6)
                    {
                        sec = to!int(input.crtStash());
                        input.clearStash();
                        crtState++;
                    }
                    break;

                case State.dot:
                    if (cur != '.')
                        break loop;
                    crtState++;
                    break;

                case State.fraction:
                    if (cur.isDigit)
                    {
                        input.stash();
                        parts++;
                    }
                    else
                    {
                        if (!input.hasStash)
                            return false; // dot with no fraction digits
                        break loop;
                    }
                    break;
            }
        }
        if (parts < 6)
            return false;
        if (crtState == State.fraction && input.hasStash)
        {
            fraction = to!int(input.crtStash());
            input.clearStash();
        }
        // NOTE(review): time values are tagged TokenType.date (not TokenType.time);
        // the unit tests below and lexDateTime rely on this — confirm before changing
        crtToken = Token(TokenType.date, Time(hours, minutes, sec, fraction).Tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("09:40:03", Token(TokenType.date, Time(9, 40, 3).Tag));
assertTokenValue("23:59:59", Token(TokenType.date, Time(23, 59, 59).Tag));
        assertTokenValue("23:59:59.999", Token(TokenType.date, Time(23, 59, 59, 999).Tag));
        // bad
        assertTokenValue("7000:00", Token(TokenType.number, 7000.tag));
        assertTokenValue("8:00", Token(TokenType.number, 8.tag));
        assertTokenValue("05:12", Token(TokenType.number, 5.tag));
        assertTokenValue("23:", Token(TokenType.number, 23.tag));
    }

    // used for both Date and DateTime lexing
    bool lexDateTime()
    {
        import core.time : msecs;
        import std.datetime : UTC;
        import haystack.util.tzdata : timeZone;

        if (!lexDate()) // try the date part
            return false;
        if (input.empty || cur != 'T') // got only the date part
            return true;

        Tag date = crtToken.data;

        input.clearStash(); // clear the date stash
        input.popFront(); // move next
        crtToken = Token();
        if (input.empty) // it must have more
            return false;

        if (!lexTime()) // get the time part
            return false;

        if (input.empty)
            return false;

        Tag time = crtToken.data;
        input.clearStash(); // clear the time stash
        crtToken = Token();

        // parse the trailing 'Z' or +/-hh:mm offset, then an optional tz name
        enum State {utc, hours, minutes, tz}

        int tkCount = 0;
        string offset;
    loop:
        for (State crtState; !input.empty; input.popFront())
        {
            final switch (crtState)
            {
                case State.utc:
                    if (cur == 'Z') // end of utc date time
                    {
                        if (input.empty)
                            break loop; // done
                        crtState = State.tz;
                        continue; // parse the UTC tz name
                    }
                    else if (cur == '-' || cur == '+') // offset
                    {
                        input.stash();
                        crtState = State.hours;
                    }
                    else
                    {
                        return false;
                    }
                    break;

                case State.hours:
                    if (cur.isDigit) // offset hours
                    {
                        input.stash();
                        tkCount++;
                        continue;
                    }
                    else if (tkCount < 2) // must be number
                    {
                        return false;
                    }
                    if
(cur == ':' && tkCount == 2) // got 2 numbers and a separator
                    {
                        input.stash();
                        tkCount = 0;
                        crtState = State.minutes;
                        continue;
                    }
                    // no separator found
                    if (tkCount > 2)
                        return false;
                    break;

                case State.minutes:
                    if (cur.isDigit) // minutes number
                    {
                        input.stash();
                        if (++tkCount == 2) // found the minutes number
                        {
                            offset = input.commitStash();
                            tkCount = 0;
                            crtState = State.tz;
                        }
                        continue;
                    }
                    else if (tkCount < 2) // must be number
                    {
                        return false;
                    }
                    break;

                case State.tz:
                    if (tkCount == 0) // tz name must be preceded by a space
                    {
                        if (cur == ' ')
                        {
                            tkCount++;
                            continue;
                        }
                        else
                        {
                            break loop;
                        }
                    }

                    if (tkCount == 1) // ensure tz starts with an alpha
                    {
                        if (cur.isAlpha)
                        {
                            input.stash();
                            tkCount++;
                            continue;
                        }
                        else
                        {
                            return false; // invalid tz start
                        }
                    }
                    else if (cur.isAlpha
                             || cur == '/'
                             || cur == '_'
                             || cur == '-'
                             || cur == '+' ) // the rest of tz chars
                    {
                        input.stash();
                    }
                    else // found all
                    {
                        break loop;
                    }

                    break;
            }
        }

        string tzName = input.commitStash();
        DateTime dt = DateTime(date.get!Date, time.get!Time);

        if (tzName.empty || tzName == "UTC")
        {
            crtToken = Token(TokenType.dateTime, SysTime(dt, msecs((time.get!Time).millis), UTC()).Tag);
        }
        else
        {
            try
            {
                auto tz = timeZone(tzName);
                crtToken = Token(TokenType.dateTime, SysTime(dt, tz).Tag);
            }
            catch(Exception e)
            {
                // unknown tz name: fall back to a fixed Etc/GMT offset zone
                import std.conv : to;
                import std.algorithm : filter;
                import std.string : indexOf;
                immutable gmtTz = "Etc/GMT" ~ offset[0..offset.indexOf(':')].filter!(c => c != '0').to!string();
                auto tz = timeZone(gmtTz);
                crtToken = Token(TokenType.dateTime, SysTime(dt, tz).Tag);
            }
        }
        return true; // done
}
    unittest
    {
        import core.time : msecs;
        import std.datetime : TimeZone, UTC;

        // good
        assertTokenValue("2017-01-17T13:51:20Z", Token(TokenType.dateTime, SysTime(DateTime(2017, 1, 17, 13, 51, 20), UTC()).Tag));
        assertTokenValue("2009-11-09T15:39:00Z", Token(TokenType.dateTime, SysTime(DateTime(2009, 11, 9, 15, 39, 0), UTC()).Tag));
        assertTokenValue("1989-12-21T15:39:00Z UTC", Token(TokenType.dateTime, SysTime(DateTime(1989, 12, 21, 15, 39, 0), UTC()).Tag));
        assertTokenValue("2015-03-31T18:06:41.956Z", Token(TokenType.dateTime, SysTime(DateTime(2015, 3, 31, 18, 6, 41), msecs(956), UTC()).Tag));

        import haystack.util.tzdata;
        assertTokenValue("2010-08-31T08:45:00+02:00 Europe/Athens", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("Europe/Athens")).Tag));
        assertTokenValue("2010-08-31T08:45:00-05:00 New_York", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("New_York")).Tag));
        assertTokenValue("2010-08-31T08:45:00+02:00 Nicosia", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("Asia/Nicosia")).Tag));
        // bad
        assertTokenEmpty("2009-11-09T");
        assertTokenEmpty("2009-11-09T4");
    }

    // Lexes a Coord scalar C(lat,lng); reuses lexNumber for both components
    bool lexCoord()
    {
        double lat, lng;
        enum State { coord, paran, lat, lng, done }
        State state;

    loop:
        for (; !input.empty; input.popFront())
        {
            final switch (state)
            {
                case State.coord:
                    if (cur != 'C')
                        return false;

                    input.stash();
                    state = State.paran;
                    continue;

                case State.paran:
                    if (cur != '(')
                        return false;

                    input.stash();
                    state = State.lat;
                    continue;

                case State.lat:
                    if (!cur.isDigit && cur != '-')
                        return false;

                    input.clearStash();

                    if (!lexNumber())
                        return false;
                    if (cur != ',')
                        return false;

                    lat = crtToken.data.get!Num;
input.clearStash();
                    state = State.lng;
                    continue;

                case State.lng:
                    if (!lexNumber())
                        return false;
                    if (cur != ')')
                        return false;
                    lng = crtToken.data.get!Num;
                    state = State.done;
                    continue;

                case State.done:
                    break loop;
            }
        }
        if (state != State.done)
            return false;
        crtToken = Token(TokenType.coord, Coord(lat, lng).Tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue("C(37.545826,-77.449188), ", Token(TokenType.coord, Coord(37.545826,-77.449188).Tag));
        // bad
        assertTokenEmpty(`C`);
        assertTokenEmpty(`C()`);
        assertTokenEmpty(`C(42.3)`);
        assertTokenEmpty(`C(42.3,)`);
    }

    // Lexes an XStr scalar Type("data"); type starts upper-case
    bool lexXStr()
    {
        enum State { firstChar, restChars, enc, done }

        State crtState;
        string type;
        string data;

    loop:
        for(; !input.empty; input.popFront())
        {
            final switch (crtState)
            {
                case State.firstChar:
                    if (!cur.isUpper)
                        return false;

                    input.stash();
                    crtState = State.restChars;
                    break;

                case State.restChars:
                    if (cur.isAlphaNum || cur == '_')
                    {
                        input.stash();
                        continue;
                    }

                    if (cur != '(')
                        return false;

                    type = input.commitStash();
                    crtState = State.enc;
                    break;

                case State.enc:
                    if (!lexStr()) // consumes the string
                        return false;
                    data = crtToken.data.get!Str;
                    crtToken = Token();
                    // check next char
                    if (!input.empty && cur == ')')
                    {
                        crtState = State.done;
                        continue;
                    }
                    else
                    {
                        return false;
                    }

                case State.done:
                    break loop;
            }
        }
        if (crtState != State.done)
            return false;
        crtToken = Token(TokenType.xstr, XStr(type, data).Tag);
        return true;
    }
    unittest
    {
        // good
        assertTokenValue(`FooBar("alabala")`, Token(TokenType.xstr, XStr("FooBar", "alabala").Tag));
assertTokenValue(`Massive("\n")`, Token(TokenType.xstr, XStr("Massive", "\n").Tag));
        assertTokenValue(`Bin("mimeType"),`, Token(TokenType.xstr, XStr("Bin", "mimeType").Tag));
        // bad
        assertTokenEmpty(`Xx(")`);
        assertTokenEmpty(`Yx(""`);
    }

    // lexer for legacy Bin tag
    bool lexBin()
    {
        enum State { bin, mime, done }
        State state;

    loop:
        for(; !input.empty; input.popFront())
        {
            final switch (state)
            {
                case State.bin:
                    if (!input.find("Bin(", true))
                        return false;

                    if (cur == '"') // found possible XStr
                    {
                        input.save();
                        return false;
                    }
                    input.clearStash();
                    input.stash();
                    state = State.mime;
                    break;

                case State.mime:
                    if (cur != ')') // accumulate unquoted mime chars
                    {
                        input.stash();
                        continue;
                    }
                    state = State.done;
                    break loop;

                case State.done:
                    assert(false);
            }
        }
        if (state != State.done)
            return false;
        string data = input.commitStash();
        crtToken = Token(TokenType.xstr, XStr("Bin", data).Tag);
        return true;
    }
    unittest
    {
        assertTokenValue(`Bin(text/plain),`, Token(TokenType.xstr, XStr("Bin", "text/plain").Tag), 2);
        assertTokenEmpty(`Bad(text/plain),`, 2);
    }

private:

    // test for a posible XStr part
    @property bool isXStrChar()
    {
        return cur.isAlphaNum || cur == '_';
    }

    // The current decoded token
    Token crtToken;
    // The look-ahead range
    LookAhead!Range input = void;
}
/// a string based lexer
alias ZincStringLexer = ZincLexer!string;
// bootstraps the rest of the lexer's unit tests
unittest
{
    auto l = ZincStringLexer("");
}
// test if provided string data decodes to the provided Token value
private void assertTokenValue(string data, Token value, int ver = 3)
{
    auto lex = ZincStringLexer(data, ver);
    assert(lex.front() == value,
"Failed expecting: " ~ value.tag.toStr ~ " got: " ~ lex.front().tag.toStr);
}

// test if provided string data decodes to the provided Token type
private void assertTokenType(string data, TokenType value, int ver = 3)
{
    auto lex = ZincStringLexer(data, ver);
    assert(lex.front().type == value);
}

// test if provided string data decodes to the provided Token value
private void assertTokenIsNan(string data, int ver = 3)
{
    auto lex = ZincStringLexer(data, ver);
    assert(lex.front().data.get!Num.isNaN);
}
// test if the provided string data can not be decoded
private void assertTokenEmpty(string data, int ver = 3)
{
    auto lex = ZincStringLexer(data, ver);
    assert(lex.front().isEmpty);
}

// Debug helper: prints every token (or raw char) produced by the lexer
package void dumpLexer(Lexer)(auto ref Lexer lex)
{
    import std.algorithm : move;
    import std.stdio : writeln;
    import std.conv : to;
    foreach (ref tk; lex.move())
    {
        writeln("Token type: ", tk.type, ", value: ", tk.type != TokenType.empty ? tk.tag.toStr() : "'" ~ to!string(tk.curChar) ~ "'");
    }
}