haystack.zinc.lexer source code

1 // Written in the D programming language.
2 /**
3 Haystack Zinc token lexer
4 
5 Copyright: Copyright (c) 2017, Radu Racariu <radu.racariu@gmail.com>
6 License:   $(LINK2 www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 Authors:   Radu Racariu
8 **/
9 module haystack.zinc.lexer;
10 import haystack.tag;
11 import haystack.util.misc;
12 import std.ascii            : isLower, 
13                               isUpper,
14                               isAlpha,
15                               isAlphaNum,
16                               isDigit,
17                               isControl,
18                               isHexDigit,
19                               isWhite;
20 
21 /// Types of tokens that the lexer can provide
enum TokenType
{
    // NOTE: the declaration order is significant. Token's contracts and
    // isScalar use ordered comparisons on these members, and
    // ZincLexer.popFront starts its lexing chain from the first member.

    /// identifier
    id,
    /// null value (no payload)
    null_,
    /// marker value (no payload)
    marker,
    /// remove value (no payload)
    remove,
    /// not-available value (no payload)
    na,
    /// boolean
    bool_,
    /// reference
    ref_,
    /// string
    str,
    /// URI
    uri,
    /// number
    number,
    /// date
    date,
    /// time
    time,
    /// date and time
    dateTime,
    /// coordinate
    coord,
    /// extended string
    xstr,
    /// no token type: the default state of a Token, also used for single char tokens
    empty = uint.max
}
41 
42 /**
43 The result of a Lexer action.
44 */
struct Token
{
    /**
    Create a token of a type and value.

    Params:
        type = any token type except `TokenType.empty`
        tag  = the value carried by the token
    */
    this(TokenType type, Tag tag)
    in (type != TokenType.empty, "Invalid token type")
    {
        this._type  = type;
        this.data   = tag;
    }

    /**
    Create a token of a non value type.

    Params:
        type = one of `null_`, `marker`, `remove` or `na`
    */
    this(TokenType type)
    in (type > TokenType.id && type < TokenType.bool_, "Invalid token type")
    {
        this._type  = type;
    }

    /**
    Create an `empty` typed token that only carries the character `c`.
    */
    static Token makeChar(dchar c)
    {
        return Token(TokenType.empty, c);
    }

    // Create a char token
    private this(TokenType type, dchar c)
    in (type == TokenType.empty, "Invalid token type")
    {
        this._type  = TokenType.empty;
        this._chr   = c;
    }

    /**
    Current TokenType
    */
    @property TokenType type() pure const
    {
        return _type;
    }

    /**
    Token's tag data
    */
    @property ref const(Tag) tag() pure const
    {
        return data;
    }

    /**
    Get a Tag value from the token's data.
    The token must not be of the `empty` type.
    */
    const(T) value(T)() const
    in (isValid, "Can't get value from empty token.")
    {
        return  data.get!T;
    }

    /**
    The character carried by a char (`empty` typed) token.
    */
    @property dchar curChar() pure const
    in (type == TokenType.empty, "Invalid token type")
    {
        return _chr;
    }

    /**
    True if the token has a type other than `empty`.
    */
    @property bool isValid() pure const
    {
        return type != TokenType.empty;
    }

    /**
    True if this token has the given type and its tag equals `value`.
    */
    bool isOf(TokenType type, Tag value) const
    {
        // the parameter shadows the `type` property, so the member has to be
        // qualified; an unqualified `type == type` is always true
        return this.type == type && tag == value;
    }

    /**
    True if this is an `id` token.
    */
    bool isId() pure const
    {
        return type == TokenType.id ;
    }

    /**
    True if this is a char token carrying exactly `c`.
    */
    bool hasChr(dchar c) pure const
    {
        return isEmpty && _chr == c;
    }

    /**
    True if the token is of the `empty` type (a char token or a default token).
    */
    @property bool isEmpty() pure const
    {
        return type == TokenType.empty;
    }

    /**
    True if the carried char is white space and the token is not a new line
    (see `isNewLine`).
    */
    @property bool isSpace() pure const
    {
        return !isNewLine && _chr.isWhite;
    }

    /**
    True for a char token whose char is both white space and a control char.

    NOTE(review): with std.ascii's classification this also matches '\t',
    not only line terminators - confirm that this is intended.
    */
    @property bool isNewLine() pure const
    {
        return isEmpty && _chr.isWhite && _chr.isControl;
    }

    /**
    True for a char token carrying an ASCII letter.
    */
    @property bool isAlpha() pure const
    {
        return isEmpty && _chr.isAlpha;
    }

    /**
    True for a char token carrying an ASCII letter or digit.
    */
    @property bool isAlphaNum() pure const
    {
        return isEmpty && _chr.isAlphaNum;
    }

    /**
    True for a char token carrying an ASCII upper case letter.
    */
    @property bool isUpper() pure const
    {
        return isEmpty && _chr.isUpper;
    }

    /**
    True if the token type lies in the `null_` .. `xstr` range,
    that is everything except `id` and `empty`.
    */
    bool isScalar() pure const
    {
        return type >= TokenType.null_
            && type <= TokenType.xstr;
    }

    /**
    Token equality.

    Tokens of the `empty`, `null_`, `marker`, `remove` and `na` types are
    compared by type only; all the others also compare their tag data.
    */
    bool opEquals()(auto ref const(Token) tk) const
    {
        // optimize non-value cases
        if (type == TokenType.empty)
            return tk.type == TokenType.empty;
        if (type == TokenType.null_)
            return tk.type == TokenType.null_;
        if (type == TokenType.marker)
            return tk.type == TokenType.marker;
        if (type == TokenType.remove)
            return tk.type == TokenType.remove;
        if (type == TokenType.na)
            return tk.type == TokenType.na;

        return type == tk.type && data == tk.data;
    }

private:
    TokenType _type = TokenType.empty; // a default constructed token is `empty`
    Tag data;                          // value payload of non char tokens
    dchar _chr;                        // payload of char tokens
}
188 
189 /**
190 Lexes Zinc tokens from some char $(D InputRange)
191 */
192 struct ZincLexer(Range) 
193 if (isCharInputRange!Range)
194 {
195     this(Range r, int ver = 3)
196     {
197         this.input  = LookAhead!Range(r);
198         this.ver    = ver;
199         if (r.empty)
200             isEmpty = true;
201         else
202             popFront();
203     }
204 
205     @property bool empty() pure nothrow
206     {
207         return isEmpty;
208     }
209 
210     @property ref const(Token) front() pure nothrow
211     {
212         return crtToken;
213     }
214 
215     @property char cur()
216     {
217         return input.front;
218     }
219 
220     void popFront()
221     {
222         if (input.hasStash)
223             input.clearStash();
224 
225         if (input.empty)
226         {
227             isEmpty = true;
228             return;
229         }
230         
231         TokenType nextToken;
232         char startChr = cur;
233 
234     loop:
235         while (!input.empty)
236         {
237             // short circuit special chars
238             if (startChr.isWhite || startChr.isControl)
239             {
240                 if (startChr == '\r') // normalize CR-LF
241                 {
242                     input.popFront();
243                     startChr = cur;
244                     continue loop;
245                 }
246                 crtToken = Token.makeChar(startChr);
247                 return input.popFront();
248             }
249 
250             switch (nextToken)
251             {
252                 case TokenType.id:
253                     if (lexId())
254                         break loop;
255                     nextToken = TokenType.null_;
256                     continue loop;
257 
258                 case TokenType.null_:
259                     if (lexNull())
260                         break loop;
261                     nextToken = TokenType.marker;
262                     continue loop;
263                 
264                 case TokenType.marker:
265                     if (lexMarker())
266                         break loop;
267                     nextToken = TokenType.remove;
268                     continue loop;
269 
270                 case TokenType.remove:
271                     if (lexRemove())
272                         break loop;
273                     nextToken = TokenType.na;
274                     continue loop;
275 
276                 case TokenType.na:
277                     if (lexNa())
278                         break loop;
279                     nextToken = TokenType.bool_;
280                     continue loop;
281 
282                 case TokenType.bool_:
283                     if (lexBool())
284                         break loop;
285                     nextToken = TokenType.ref_;
286                     continue loop;
287 
288                 case TokenType.ref_:
289                     if (lexRef())
290                         break loop;
291                     nextToken = TokenType.str;
292                     continue loop;
293 
294                 case TokenType.str:
295                     if (lexStr())
296                         break loop;
297                     nextToken = TokenType.uri;
298                     continue loop;
299 
300                 case TokenType.uri:
301                     if (lexUri())
302                         break loop;
303                     nextToken = TokenType.number;
304                     continue loop;
305 
306                 case TokenType.number:
307                     if (lexNumber())
308                     {
309                         if (input.crtStash.length <= 4) // verify if this isn't a date
310                         {
311                             if (lexDateTime() || lexTime())
312                                 break loop;
313                         }
314                         break loop;
315                     }
316                     else
317                     {
318                         nextToken = TokenType.dateTime;
319                         continue loop;
320                     }
321 
322                 case TokenType.dateTime: // the date part can be parsed here, so try both
323                     if (lexDateTime())
324                         break loop;
325                     nextToken = TokenType.time;
326                     continue loop;
327                     
328                 case TokenType.time:
329                     if (lexTime())
330                         break loop;
331                     nextToken = TokenType.coord;
332                     continue loop;
333 
334                 case TokenType.coord:
335                     if (lexCoord())
336                         break loop;
337                     nextToken = TokenType.xstr;
338                     continue loop;
339 
340                 case TokenType.xstr:
341                     if (ver < 3 && lexBin())
342                         break loop;
343                     else if (lexXStr())
344                         break loop;
345                      goto default;
346 
347                 default:
348                     crtToken = Token.makeChar(startChr);
349                     if (input.hasStash)
350                         return input.save();
351                     if (!input.empty)
352                         input.popFront();
353                     break loop;
354             }
355         }
356     }
357 
358     @property ref Range range() scope return
359     {
360         return input.range;
361     }
362     
363     @property void range(scope ref Range r)
364     {
365         input.range     = r;
366         crtToken        = Token.makeChar(r.front);
367     }
368 
369     // zinc spec version
370     int ver = 3;
371 
372     // internals
373 package(haystack):
374     
375     @disable this();
376     @disable this(this);
377 
378     @property ref buffer() scope return
379     {
380         return input;
381     }
382 
383     bool isEmpty = false;
384 
385     bool lexId()
386     {
387         enum State { fistChar, restChars }
388     loop:
389         for (State crtState; !input.empty; input.popFront())
390         {
391             final switch (crtState)
392             {
393                 case State.fistChar: // required to start with lower case alpha
394                     if (!cur.isLower)
395                         return false;
396                     input.stash();
397                     crtState++;
398                     continue;
399 
400                 case State.restChars:
401                     if (isXStrChar)
402                         input.stash();
403                     else
404                         break loop;
405             }
406         }
407         crtToken = Token(TokenType.id, input.commitStash().tag);
408         return true;
409     }
410     unittest
411     {
412         // good
413         assertTokenValue("abcAbcD123_wwwe", Token(TokenType.id, "abcAbcD123_wwwe".tag));
414         assertTokenValue("idFoo@", Token(TokenType.id, "idFoo".tag));
415         assertTokenValue("idBar ", Token(TokenType.id, "idBar".tag));
416         assertTokenValue("someId,", Token(TokenType.id, "someId".tag));
417         // bad
418         assertTokenEmpty("BAD%Id");
419     }
420 
421     bool lexNull()
422     {
423         if (cur != 'N')
424             return false;
425         
426         // probe if this has more
427         input.stash();
428         input.popFront();
429         if (!input.empty && isXStrChar)
430         {
431             input.save(); // save look ahead
432             return false;
433         }
434 
435         crtToken = Token(TokenType.null_, Tag());
436         return true;
437     }
438     unittest
439     {
440         // good
441         assertTokenValue("N", Token(TokenType.null_));
442         assertTokenValue("N ", Token(TokenType.null_));
443         assertTokenValue("N,", Token(TokenType.null_));
444         // bad
445         assertTokenEmpty("X");
446         assertTokenEmpty("Nx");
447     }
448 
449     bool lexMarker()
450     {
451         if (cur != 'M')
452             return false;
453         
454         input.stash();
455         input.popFront();
456         if (!input.empty && isXStrChar)
457         {
458             input.save;
459             return false;
460         }
461         
462         crtToken = Token(TokenType.marker, marker());
463         return true;
464     }
465     unittest
466     {
467         // good
468         assertTokenValue("M", Token(TokenType.marker));
469         assertTokenValue("M ", Token(TokenType.marker));
470         assertTokenValue("M|", Token(TokenType.marker));
471         // bad
472         assertTokenEmpty("Y");
473     }
474 
475     bool lexRemove()
476     {
477         if (cur != 'R')
478             return false;
479         
480         input.stash();
481         input.popFront();
482         if (!input.empty && isXStrChar)
483         {
484             input.save;
485             return false;
486         }
487 
488         crtToken = Token(TokenType.remove, Tag.init);
489         return true;
490     }
491     unittest
492     {
493         // good
494         assertTokenValue("R", Token(TokenType.remove));
495         assertTokenValue("R ", Token(TokenType.remove));
496         assertTokenValue("R,", Token(TokenType.remove));
497         // bad
498         assertTokenEmpty("K");
499     }
500 
501     bool lexNa()
502     {
503         if (cur != 'N')
504             return false;
505         
506         input.stash();
507         input.popFront();
508         input.stash();
509         if (cur != 'A')
510         {
511             // more to lex
512             input.popFront();
513             input.save();
514             return false;
515         }
516 
517         if (!input.empty)
518         {
519             input.popFront();
520             if (!input.empty && isXStrChar)
521             {
522                 input.stash();
523                 input.save;
524                 return false;
525             }
526         }
527         crtToken = Token(TokenType.na, Na().Tag);
528         return true;
529     }
530     unittest
531     {   
532         // good
533         assertTokenValue("NA", Token(TokenType.na));
534         assertTokenValue("NA ", Token(TokenType.na));
535         assertTokenValue("NA,", Token(TokenType.na));
536         // bad
537         assertTokenEmpty("NAM,");
538         assertTokenEmpty("XY");
539     }
540 
541     bool lexBool()
542     {
543         if (cur != 'T' && cur != 'F')
544             return false;
545         
546         const val = (cur == 'T');
547             
548         input.stash();
549         input.popFront();
550         if (!input.empty && isXStrChar)
551         {
552             input.save;
553             return false;
554         }
555 
556         crtToken = Token(TokenType.bool_, val.tag);
557         return true;
558     }
559     unittest
560     {
561         // good
562         assertTokenValue("T", Token(TokenType.bool_, true.tag));
563         assertTokenValue("T\t", Token(TokenType.bool_, true.tag));
564         assertTokenValue("F", Token(TokenType.bool_, false.tag));
565         assertTokenValue("F,", Token(TokenType.bool_, false.tag));
566         // bad
567         assertTokenEmpty("K");
568     }
569 
570     bool lexRef()
571     {
572         if (cur != '@')
573             return false;
574 
575         string val;
576         string dis;
577 
578         for (input.popFront(); !input.empty; input.popFront())
579         {
580             if (cur.isAlphaNum
581                 || cur == '_' 
582                 || cur == ':' 
583                 || cur == '-' 
584                 || cur == '.' 
585                 || cur == '~')
586             {
587                 input.stash();
588             }
589             else if ((cur.isWhite && !cur.isControl) && input.hasStash)
590             {
591                 input.popFront(); // skip ws
592                 val = input.commitStash();
593                 if (lexStr())
594                 {
595                     dis = crtToken.value!Str;
596                     crtToken = Token();
597                 }
598                 else
599                 {
600                     input.save();
601                 }
602                 break;
603             }
604             else if (input.hasStash)
605             {
606                 break;
607             }
608             else
609             {
610                 return false;
611             }
612         }
613         if (val is null)
614         {
615             if (!input.hasStash)
616                 return false;
617             val = input.commitStash();
618         }
619         crtToken = Token(TokenType.ref_, Tag(Ref(val, dis)));
620         return true;
621     }
622     unittest
623     {
624         // good
625         assertTokenValue("@fooBar,", Token(TokenType.ref_, Ref("fooBar").Tag));
626         assertTokenValue(`@fooBar "a nice description"`, Token(TokenType.ref_, Ref("fooBar", "a nice description").Tag));
627         assertTokenValue(`@fooBar ,`, Token(TokenType.ref_, Ref("fooBar").Tag));
628         // bad
629         assertTokenEmpty("@");
630         assertTokenEmpty("&");
631         assertTokenEmpty("@#");
632     }
633 
634     string lexChars(immutable char[] esc, immutable char[] escVal, char delim = '"')
635     in (esc.length == escVal.length)
636     {
637         import std.format   : formattedRead;
638         import std.string   : indexOf;
639 
640         if (cur != delim)
641             return null;
642         
643         bool hasTerm = false;
644         for (input.popFront(); !input.empty; input.popFront())
645         {
646         loop:
647             if (cur == delim) // found terminator
648             {
649                 hasTerm = true;
650                 input.popFront();
651                 break;
652             }
653 
654             if (cur < ' ')
655                 return null;
656             
657             if (cur != '\\')
658             {
659                 input.stash();
660             }
661             else
662             {
663                 if (input.empty)
664                     return null;
665 
666                 input.popFront();
667                 if (cur == 'u')
668                 {
669                     if (input.empty)
670                         return null;
671                     input.popFront();
672                     if (input.empty || !cur.isHexDigit)
673                         return null;
674                     dchar unicodeChar; 
675                     int count = input.formattedRead("%x", &unicodeChar);
676                     if (!count)
677                         return null;
678                     input.stash(unicodeChar);
679                     // we consumed all u's chars, no need to popFront
680                     goto loop; 
681                 }
682                 ptrdiff_t escPos = esc.indexOf(cur);
683                 if (escPos != -1)
684                     input.stash(escVal[escPos]);
685                 else
686                     return null;
687             }
688         }
689         if (!hasTerm)
690             return null;
691         if (!input.hasStash)
692             return "";
693         return input.commitStash();
694     }
695 
696     bool lexStr()
697     {
698         enum delim                  = '"';
699         static immutable strEsc     = [ 'n', 'r', 't', '"', '\\', '$', 'b', 'f'];
700         static immutable strEscVal  = ['\n', '\r', '\t', '"', '\\', '$', '\b', '\f'];
701         
702         string chars = lexChars(strEsc, strEscVal, delim);
703         if (chars is null)
704             return false;
705         
706         crtToken = Token(TokenType.str, chars.tag);
707         return true;
708     }
709     unittest
710     {
711         // good
712         assertTokenValue(`"hello world"`, Token(TokenType.str, "hello world".tag));
713         assertTokenValue(`"a line\nsome\ttab"`, Token(TokenType.str, "a line\nsome\ttab".tag));
714         assertTokenValue(`""`, Token(TokenType.str, "".tag));
715         assertTokenValue(`"some unicode char: \u00E6"`, Token(TokenType.str, "some unicode char: æ".tag));
716         assertTokenValue(`"inline unicode char: 語"`, Token(TokenType.str, "inline unicode char: 語".tag));
717         // bad
718         assertTokenEmpty(`"`);
719         assertTokenEmpty(`"fooo`);
720         assertTokenEmpty(`"a bad \u"`);
721     }
722 
723     bool lexUri()
724     {
725         enum delim = '`';
726         static immutable uriEsc = [':', '/', '?', '#', '[', ']', '@', '`', '\\', '&', '=', ';'];
727         static immutable uriEscVal = [':', '/', '?', '#', '[', ']', '@', '`', '\\', '&', '=', ';'];
728         string chars = lexChars(uriEsc, uriEscVal, delim);
729         if (chars !is null)
730         {
731             crtToken = Token(TokenType.uri, cast(Tag) Uri(chars));
732             return true;
733         }
734         return false;
735     }
736     unittest
737     {
738         // good
739         assertTokenValue("`/a/b/c`", Token(TokenType.uri, cast(Tag) Uri("/a/b/c")));
740         // bad
741         assertTokenEmpty("`");
742     }
743 
744     bool lexNumber()
745     {
746         import std.math : isNaN;
747         enum State { integral, fractionalDigit, fractional, expSign, exp, unit }
748         
749         // test the optional sign
750         if (cur == '-')
751         {
752             input.stash();
753             input.popFront();
754             if (input.empty)
755                 return false;
756             if (!cur.isDigit && input.find("INF"))
757             {
758                 crtToken = Token(TokenType.number, tag(-1 * double.infinity));
759                 return true;
760             }
761         }
762         // lex number parts
763         if (cur.isDigit)
764         {
765             static void parseNum(const(char)[] chars, ref double val)
766             {
767                 import std.format : formattedRead;
768                 chars.formattedRead("%g", &val);
769             }
770 
771             bool isUnit()
772             {
773                 return cur.isAlpha || cur == '%' || cur == '_' || cur == '/' || cur == '$' || cur > 127;
774             }
775 
776             double value;
777             string unit;
778             State crtState = State.integral;
779             input.stash();
780         loop:
781             for (input.popFront(); !input.empty; input.popFront())
782             {
783                 final switch (crtState)
784                 {
785                     case State.integral:
786                         if (cur.isDigit)
787                         {
788                             input.stash();
789                         }
790                         else if (cur == '_')
791                         {
792                             continue;
793                         }
794                         else if (cur == '.')
795                         {
796                             input.stash();
797                             crtState = State.fractionalDigit;
798                         }
799                         else if (isUnit)
800                         {
801                             parseNum(input.crtStash, value);
802                             input.clearStash();
803                             input.stash();
804                             crtState = State.unit;
805                         }
806                         else
807                         {
808                             // save current scratch buffer and try matching for date or time
809                             if (input.crtStash.length == 4 && cur == '-'
810                                     || input.crtStash.length == 2 && cur == ':')
811                                 input.save();
812                             break loop;
813                         }
814                         break;
815 
816                     case State.fractionalDigit:
817                         if (cur.isDigit)
818                         {
819                             input.stash();
820                             crtState = State.fractional;
821                         }
822                         else
823                         {
824                             return false;
825                         }
826                         break;
827 
828                     case State.fractional:
829                         if (cur.isDigit)
830                         {
831                             input.stash();
832                         }
833                         else if (cur == '_')
834                         {
835                             continue;
836                         }
837                         else if (cur == 'e' || cur == 'E')
838                         {    
839                             input.stash();
840                             crtState = State.expSign;
841                         }
842                         else if (isUnit)
843                         {
844                             parseNum(input.crtStash, value);
845                             input.clearStash();
846                             input.stash();
847                             crtState = State.unit;
848                         }
849                         else
850                         {   
851                             // nothing to process
852                             break loop;
853                         }
854                         break;
855 
856                     case State.expSign:
857                         if (cur == '+' || cur == '-' || cur.isDigit)
858                         {
859                             input.stash();
860                             crtState = State.exp;
861                         }
862                         else
863                         {
864                             return false;
865                         }
866                         break;
867 
868                     case State.exp:
869                         if (cur.isDigit)
870                         {
871                             input.stash();
872                         }
873                         else if (isUnit)
874                         {
875                             parseNum(input.crtStash, value);
876                             input.clearStash();
877                             input.stash();
878                             crtState = State.unit;
879                         }
880                         else
881                         {
882                             // nothing to process
883                             break loop;
884                         }
885                         break;
886 
887                     case State.unit:
888                         if (isUnit)
889                         {
890                             input.stash();
891                         }
892                         else
893                         {
894                             // nothing to process
895                             break loop;
896                         }
897                         break;
898                 }
899             }
900 
901             if (value.isNaN)
902                 parseNum(input.crtStash, value);
903             else
904                 unit = input.commitStash();
905 
906             crtToken = Token(TokenType.number, Num(value, unit).Tag);
907             return true;
908         }
909         else if (input.find("INF"))
910         {
911             crtToken = Token(TokenType.number, tag(double.infinity));
912             return true;
913         }
914         else if (input.crtStash == "Na")
915         {
916             input.clearStash();
917             if (input.empty)
918                 return false;
919 
920             if (cur == 'N')
921             {
922                 input.popFront();
923                 crtToken = Token(TokenType.number, tag(double.nan));
924                 return true;
925             }
926         }
927         return false;
928     }
929     unittest
930     {
931         assertTokenValue("-INF", Token(TokenType.number, (-1 * double.infinity).tag));
932         assertTokenValue("INF", Token(TokenType.number, (double.infinity).tag));
933         assertTokenIsNan("NaN");
934         assertTokenIsNan("NaN,");
935         assertTokenValue("100", Token(TokenType.number, 100.tag));
936         assertTokenValue("-88", Token(TokenType.number, (-88).tag));
937         assertTokenValue("-99.", Token(TokenType.number, (-99.0).tag));
938         assertTokenValue("42.42", Token(TokenType.number, (42.42).tag));
939         assertTokenValue("9.6e+10", Token(TokenType.number, (96000000000).tag));
940         assertTokenValue("100%", Token(TokenType.number, Num(100, "%").Tag));
941         assertTokenValue("100$", Token(TokenType.number, Num(100, "$").Tag));
942         // bad
943         assertTokenEmpty("-");
944         assertTokenEmpty("Na");
945         assertTokenEmpty("IN");
946         assertTokenEmpty("_12");
947     }
948 
949     bool lexDate()
950     {
951         import std.conv : to;
952         enum State { year, month, day }
953 
954         State crtState;
955         int year, month, day;
956         int parts = 0;
957         
958         for (; !input.empty && parts < 8; input.popFront())
959         {
960             final switch (crtState)
961             {
962                 case State.year:
963                     if (cur.isDigit)
964                     {
965                         input.stash();
966                         if (++parts > 4) // to many digits
967                             return false;
968                     }
969                     else if (cur == '-' && parts == 4)
970                     {
971                         year = to!int(input.crtStash());
972                         input.clearStash();
973                         crtState++;
974                     }
975                     else if (parts > 0) // keep the stashed digits
976                     {
977                         input.save();
978                         return false;
979                     }
980                     else // no match
981                     {
982                         return false;
983                     }
984                     break;
985 
986                 case State.month:
987                     if (cur.isDigit)
988                     {
989                         input.stash();
990                         if (++parts > 6)
991                             return false;
992                     }
993                     else if (cur == '-' && parts == 6)
994                     {
995                         month = to!int(input.crtStash());
996                         input.clearStash();
997                         crtState++;
998                     }
999                     else
1000                     {
1001                         return false;
1002                     }
1003                     break;
1004 
1005                 case State.day:
1006                     if (!cur.isDigit)
1007                         return false;
1008                     
1009                     input.stash();
1010                     if (++parts == 8)
1011                     {
1012                         day = to!int(input.crtStash());
1013                         input.clearStash();
1014                     }
1015                     break;
1016             }
1017         }
1018         if (crtState < State.day)
1019             return false;
1020         crtToken = Token(TokenType.date, Date(year, month, day).Tag);
1021         return true;
1022     }
1023     unittest
1024     {
1025         // good
1026         assertTokenValue("2017-01-15", Token(TokenType.date, Date(2017, 01, 15).Tag));
1027         assertTokenValue("2900-12-31", Token(TokenType.date, Date(2900, 12, 31).Tag));
1028         assertTokenValue("2900-12-31234", Token(TokenType.date, Date(2900, 12, 31).Tag));
1029         // bad
1030         assertTokenValue("200017-111-22", Token(TokenType.number, 200017.tag));
1031         assertTokenValue("2-1-2", Token(TokenType.number, 2.tag));
1032         assertTokenValue("2017_1-2", Token(TokenType.number, 20171.tag));
1033     }
1034 
1035     bool lexTime()
1036     {
1037         import std.conv : to;
1038         enum State { hours, minutes, sec, dot, fraction }
1039         
1040         State crtState;
1041         int hours, minutes, sec, fraction;
1042         int parts = 0;
1043 
1044         if (input.empty)
1045             return false;
1046 
1047     loop:
1048         for (; !input.empty; input.popFront())
1049         {
1050             final switch (crtState)
1051             {
1052                 case State.hours:
1053                     if (cur.isDigit) // check the 2nd digit of the hours number
1054                     {
1055                         input.stash();
1056                         if (++parts > 2)
1057                             return false;
1058                     }
1059                     else if (cur == ':' && parts == 2) // got the 2 hour numbers and sep
1060                     {
1061                         hours = to!int(input.crtStash());
1062                         input.clearStash();
1063                         crtState++;
1064                     }
1065                     // no separator found
1066                     else
1067                     {
1068                         return false;
1069                     }
1070                     break;
1071 
1072                 case State.minutes:
1073                     if (cur.isDigit)
1074                     {
1075                         input.stash();
1076                         if (++parts > 4)
1077                             return false;
1078                     }
1079                     else if (cur == ':' && parts == 4)
1080                     {
1081                         minutes = to!int(input.crtStash());
1082                         input.clearStash();
1083                         crtState++;
1084                     }
1085                     else
1086                     {
1087                         return false; // no separator found
1088                     }
1089                     break;
1090 
1091                 case State.sec:
1092                     if (cur.isDigit)
1093                     {
1094                         input.stash();
1095                         parts++;
1096                     }
1097                     else
1098                     {
1099                         return false;
1100                     }
1101                     
1102                     if (parts == 6)
1103                     {
1104                         sec = to!int(input.crtStash());
1105                         input.clearStash();
1106                         crtState++;
1107                     }
1108                     break;
1109 
1110                 case State.dot:
1111                     if (cur != '.')
1112                         break loop;
1113                     crtState++;
1114                     break;
1115 
1116                 case State.fraction:
1117                     if (cur.isDigit)
1118                     {
1119                         input.stash();
1120                         parts++;
1121                     }
1122                     else
1123                     {
1124                         if (!input.hasStash)
1125                             return false;
1126                         break loop;
1127                     }
1128                     break;
1129             }
1130         }
1131         if (parts < 6)
1132             return false;
1133         if (crtState == State.fraction && input.hasStash)
1134         {
1135             fraction = to!int(input.crtStash());
1136             input.clearStash();
1137         }
1138         crtToken = Token(TokenType.date, Time(hours, minutes, sec, fraction).Tag);
1139         return true;
1140     }
1141     unittest
1142     {
1143         // good
1144         assertTokenValue("09:40:03", Token(TokenType.date, Time(9, 40, 3).Tag));
1145         assertTokenValue("23:59:59", Token(TokenType.date, Time(23, 59, 59).Tag));
1146         assertTokenValue("23:59:59.999", Token(TokenType.date, Time(23, 59, 59, 999).Tag));
1147         // bad
1148         assertTokenValue("7000:00", Token(TokenType.number, 7000.tag));
1149         assertTokenValue("8:00", Token(TokenType.number, 8.tag));
1150         assertTokenValue("05:12", Token(TokenType.number, 5.tag));
1151         assertTokenValue("23:", Token(TokenType.number, 23.tag));
1152     }
1153 
1154     // used for both Date and DateTime lexing
1155     bool lexDateTime()
1156     {
1157         import core.time    : msecs;
1158         import std.datetime : UTC;
1159         import haystack.util.tzdata : timeZone;
1160 
1161         if (!lexDate()) // try the date part
1162             return false;
1163         if (input.empty || cur != 'T') // got only the date part
1164             return true;
1165         
1166         Tag date = crtToken.data;
1167 
1168         input.clearStash(); // clear the date stash
1169         input.popFront(); // move next
1170         crtToken = Token();
1171         if (input.empty) // it must have more
1172             return false;
1173 
1174         if (!lexTime()) // get the time part
1175             return false;
1176             
1177         if (input.empty)
1178             return false;
1179             
1180         Tag time    = crtToken.data;
1181         input.clearStash(); // clear the time stash
1182         crtToken = Token();
1183 
1184         enum State {utc, hours, minutes, tz}
1185             
1186         int tkCount = 0;
1187         string offset;
1188     loop:
1189         for (State crtState; !input.empty; input.popFront())
1190         {
1191             final switch (crtState)
1192             {
1193                 case State.utc:
1194                     if (cur == 'Z') // end of utc date time
1195                     {
1196                         if (input.empty)
1197                             break loop; // done
1198                         crtState = State.tz;
1199                         continue; // parse the UTC tz name
1200                     }
1201                     else if (cur == '-' || cur == '+') // offset
1202                     {
1203                         input.stash();
1204                         crtState = State.hours;
1205                     }
1206                     else
1207                     {
1208                         return false;
1209                     }
1210                     break;
1211 
1212                 case State.hours:
1213                     if (cur.isDigit) // offset hours
1214                     {
1215                         input.stash();
1216                         tkCount++;
1217                         continue;
1218                     }
1219                     else if (tkCount < 2) // must be number
1220                     {
1221                         return false;
1222                     }
1223                     if (cur == ':' && tkCount == 2) // got 2 numbers and a separator
1224                     {
1225                         input.stash();
1226                         tkCount     = 0;
1227                         crtState    = State.minutes;
1228                         continue;
1229                     }
1230                     // no separator found
1231                     if (tkCount > 2)
1232                         return false;
1233                     break;
1234 
1235                 case State.minutes:
1236                     if (cur.isDigit) // minutes number
1237                     {
1238                         input.stash();
1239                         if (++tkCount == 2) // found the minutes number
1240                         {
1241                             offset      = input.commitStash();
1242                             tkCount     = 0;
1243                             crtState    = State.tz;
1244                         }
1245                         continue;
1246                     }
1247                     else if (tkCount < 2) // must be number
1248                     {
1249                         return false;
1250                     }
1251                     break;
1252 
1253                 case State.tz:
1254                     if (tkCount == 0)
1255                     {
1256                         if (cur == ' ')
1257                         {
1258                             tkCount++;
1259                             continue;
1260                         }
1261                         else
1262                         {
1263                             break loop;
1264                         }
1265                     }
1266                             
1267                     if (tkCount == 1) // ensure tz starts with an alpha
1268                     {
1269                         if (cur.isAlpha)
1270                         {
1271                             input.stash();
1272                             tkCount++;
1273                             continue;
1274                         }
1275                         else
1276                         {
1277                             return false; // invalid tz start
1278                         }
1279                     }
1280                     else if (cur.isAlpha 
1281                                 || cur == '/' 
1282                                 || cur == '_' 
1283                                 || cur == '-' 
1284                                 || cur == '+' ) // the rest of tz chars
1285                     {
1286                         input.stash();
1287                     }
1288                     else // found all
1289                     {
1290                         break loop;
1291                     }
1292 
1293                     break;
1294             }
1295         }
1296 
1297         string tzName = input.commitStash();
1298         DateTime dt = DateTime(date.get!Date, time.get!Time);
1299             
1300         if (tzName.empty || tzName == "UTC")
1301         {
1302             crtToken = Token(TokenType.dateTime, SysTime(dt, msecs((time.get!Time).millis), UTC()).Tag);
1303         }
1304         else
1305         {
1306             try
1307             {
1308                 auto tz = timeZone(tzName);
1309                 crtToken = Token(TokenType.dateTime, SysTime(dt, tz).Tag);
1310             }
1311             catch(Exception e)
1312             {
1313                 import std.conv         : to;
1314                 import std.algorithm    : filter;
1315                 import std.string       : indexOf;
1316                 immutable gmtTz = "Etc/GMT" ~ offset[0..offset.indexOf(':')].filter!(c => c != '0').to!string();
1317                 auto tz = timeZone(gmtTz);
1318                 crtToken = Token(TokenType.dateTime, SysTime(dt, tz).Tag);
1319             }
1320         }
1321         return true; // done        
1322     }
1323     unittest
1324     {
1325         import core.time : msecs;
1326         import std.datetime : TimeZone, UTC;
1327 
1328         // good
1329         assertTokenValue("2017-01-17T13:51:20Z", Token(TokenType.dateTime, SysTime(DateTime(2017, 1, 17, 13, 51, 20), UTC()).Tag));
1330         assertTokenValue("2009-11-09T15:39:00Z", Token(TokenType.dateTime, SysTime(DateTime(2009, 11, 9, 15, 39, 0), UTC()).Tag));
1331         assertTokenValue("1989-12-21T15:39:00Z UTC", Token(TokenType.dateTime, SysTime(DateTime(1989, 12, 21, 15, 39, 0), UTC()).Tag));
1332         assertTokenValue("2015-03-31T18:06:41.956Z", Token(TokenType.dateTime, SysTime(DateTime(2015, 3, 31, 18, 6, 41), msecs(956), UTC()).Tag));
1333         
1334         import haystack.util.tzdata;
1335         assertTokenValue("2010-08-31T08:45:00+02:00 Europe/Athens", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("Europe/Athens")).Tag));
1336         assertTokenValue("2010-08-31T08:45:00-05:00 New_York", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("New_York")).Tag));
1337         assertTokenValue("2010-08-31T08:45:00+02:00 Nicosia", Token(TokenType.dateTime, SysTime(DateTime(2010, 8, 31, 8, 45, 0), timeZone("Asia/Nicosia")).Tag));
1338         // bad
1339         assertTokenEmpty("2009-11-09T");
1340         assertTokenEmpty("2009-11-09T4");
1341     }
1342 
1343     bool lexCoord()
1344     {
1345         double lat, lng;
1346         enum State { coord, paran, lat, lng, done }
1347         State state;
1348 
1349         loop:
1350         for (; !input.empty; input.popFront())
1351         {
1352             final switch (state)
1353             {
1354                 case State.coord:
1355                     if (cur != 'C')
1356                         return false;
1357                     
1358                     input.stash();
1359                     state = State.paran;
1360                     continue;
1361 
1362                 case State.paran:
1363                     if (cur != '(')
1364                         return false;
1365                     
1366                     input.stash();
1367                     state = State.lat;
1368                     continue;
1369                    
1370                 case State.lat:
1371                     if (!cur.isDigit && cur != '-')
1372                         return false;
1373 
1374                     input.clearStash();
1375                     
1376                     if (!lexNumber())
1377                         return false;
1378                     if (cur != ',')
1379                         return false;
1380                     
1381                     lat = crtToken.data.get!Num;
1382                     input.clearStash();
1383                     state = State.lng;
1384                     continue;
1385 
1386                 case State.lng:
1387                     if (!lexNumber())
1388                         return false;
1389                     if (cur != ')')
1390                         return false;
1391                     lng = crtToken.data.get!Num;
1392                     state = State.done;
1393                     continue;
1394 
1395                 case State.done:
1396                     break loop;
1397             }
1398         }
1399         if (state != State.done)
1400             return false;
1401         crtToken = Token(TokenType.coord, Coord(lat, lng).Tag);
1402         return true;
1403     }
1404     unittest
1405     {
1406         // good
1407         assertTokenValue("C(37.545826,-77.449188), ", Token(TokenType.coord, Coord(37.545826,-77.449188).Tag));
1408         // bad
1409         assertTokenEmpty(`C`);
1410         assertTokenEmpty(`C()`);
1411         assertTokenEmpty(`C(42.3)`);
1412         assertTokenEmpty(`C(42.3,)`);
1413     }
1414 
1415     bool lexXStr()
1416     {
1417         enum State { firstChar, restChars, enc, done }
1418         
1419         State crtState;
1420         string type;
1421         string data;
1422 
1423         loop:
1424         for(; !input.empty; input.popFront())
1425         {
1426             final switch (crtState)
1427             {
1428                 case State.firstChar:
1429                     if (!cur.isUpper)
1430                         return false;
1431 
1432                     input.stash();
1433                     crtState    = State.restChars;
1434                     break;
1435 
1436                 case State.restChars:
1437                     if (cur.isAlphaNum || cur == '_')
1438                     {
1439                         input.stash();
1440                         continue;
1441                     }
1442 
1443                     if (cur != '(')
1444                         return false;
1445                     
1446                     type        = input.commitStash();
1447                     crtState    = State.enc;
1448                     break;
1449 
1450                 case State.enc:
1451                     if (!lexStr()) // consumes the string
1452                         return false;
1453                     data = crtToken.data.get!Str;
1454                     crtToken = Token();
1455                     // check next char
1456                     if (!input.empty && cur == ')')
1457                     {
1458                         crtState    = State.done;
1459                         continue;
1460                     }
1461                     else
1462                     {
1463                         return false;
1464                     }
1465                     
1466                 case State.done:
1467                     break loop;
1468             }
1469         }
1470         if (crtState != State.done)
1471             return false;
1472         crtToken = Token(TokenType.xstr, XStr(type, data).Tag);
1473         return true;
1474     }
1475     unittest
1476     {
1477         // good
1478         assertTokenValue(`FooBar("alabala")`, Token(TokenType.xstr, XStr("FooBar", "alabala").Tag));
1479         assertTokenValue(`Massive("\n")`, Token(TokenType.xstr, XStr("Massive", "\n").Tag));
1480         assertTokenValue(`Bin("mimeType"),`, Token(TokenType.xstr, XStr("Bin", "mimeType").Tag));
1481         // bad
1482         assertTokenEmpty(`Xx(")`);
1483         assertTokenEmpty(`Yx(""`);
1484     }
1485 
1486     // lexer for legacy Bin tag
1487     bool lexBin()
1488     {
1489         enum State { bin, mime, done }
1490         State state;
1491 
1492         loop:
1493         for(; !input.empty; input.popFront())
1494         {
1495             final switch (state)
1496             {
1497                 case State.bin:
1498                     if (!input.find("Bin(", true))
1499                        return false;
1500 
1501                     if (cur == '"') // found possible XStr
1502                     {
1503                         input.save();
1504                         return false;
1505                     }
1506                     input.clearStash();
1507                     input.stash();
1508                     state   = State.mime;
1509                     break;
1510 
1511                 case State.mime:
1512                     if (cur != ')')
1513                     {
1514                         input.stash();
1515                         continue;
1516                     }
1517                     state   = State.done;
1518                     break loop;
1519 
1520                 case State.done:
1521                     assert(false);
1522             }
1523         }
1524         if (state != State.done)
1525             return false;
1526         string data = input.commitStash();
1527         crtToken = Token(TokenType.xstr, XStr("Bin", data).Tag);
1528         return true;
1529     }
1530     unittest
1531     {
1532         assertTokenValue(`Bin(text/plain),`, Token(TokenType.xstr, XStr("Bin", "text/plain").Tag), 2);
1533         assertTokenEmpty(`Bad(text/plain),`, 2);
1534     }
1535 
1536 private:
1537 
1538     // test for a possible XStr part
1539     @property bool isXStrChar()
1540     {
1541         return cur.isAlphaNum || cur == '_';
1542     }
1543 
1544     // The current decoded token
1545     Token crtToken;
1546     // The look-ahead range
1547     LookAhead!Range input = void;
1548 }
1549 /// a string based lexer
1550 alias ZincStringLexer = ZincLexer!string;
1551 // bootstraps the rest of the lexer's unit tests
unittest
{
    // instantiating the template is what pulls in the unit tests declared inside it
    ZincStringLexer bootstrap = ZincStringLexer("");
}
1556 // test if provided string data decodes to the provided Token value
private void assertTokenValue(string data, Token value, int ver = 3)
{
    // lex `data` with the requested version and compare its first token against `value`
    auto lexer  = ZincStringLexer(data, ver);
    auto actual = lexer.front();
    assert(actual == value,
           "Failed expecting: " ~ value.tag.toStr ~ " got: " ~ actual.tag.toStr);
}
1562 
1563 // test if provided string data decodes to the provided Token type
private void assertTokenType(string data, TokenType value, int ver = 3)
{
    // only the type of the first token is checked, not its value
    auto lexer = ZincStringLexer(data, ver);
    immutable TokenType actual = lexer.front().type;
    assert(actual == value);
}
1569 
1570 // test if provided string data decodes to a number Token whose value is NaN
private void assertTokenIsNan(string data, int ver = 3)
{
    // the first token must carry a Num whose value is NaN
    auto lexer  = ZincStringLexer(data, ver);
    auto number = lexer.front().data.get!Num;
    assert(number.isNaN);
}
1576 // test if the provided string data can not be decoded
private void assertTokenEmpty(string data, int ver = 3)
{
    // the first token must report itself as empty, i.e. nothing was decoded
    auto lexer  = ZincStringLexer(data, ver);
    auto actual = lexer.front();
    assert(actual.isEmpty);
}
1582 
/// Writes every token produced by `lex` to stdout, one per line.
/// Value tokens are shown through their tag, char tokens as the quoted char.
package void dumpLexer(Lexer)(auto ref Lexer lex)
{
    import std.algorithm    : move;
    import std.stdio        : writeln;
    import std.conv         : to;
    foreach (ref tk; lex.move())
    {
        immutable isCharToken = tk.type == TokenType.empty;
        auto shown = isCharToken
            ? "'" ~ to!string(tk.curChar) ~ "'"
            : tk.tag.toStr();
        writeln("Token type: ", tk.type, ", value: ", shown);
    }
}