lex.c

00001 /*
00002  * Copyright (c) 2011 Jiri Svoboda
00003  * All rights reserved.
00004  *
00005  * Redistribution and use in source and binary forms, with or without
00006  * modification, are permitted provided that the following conditions
00007  * are met:
00008  *
00009  * - Redistributions of source code must retain the above copyright
00010  *   notice, this list of conditions and the following disclaimer.
00011  * - Redistributions in binary form must reproduce the above copyright
00012  *   notice, this list of conditions and the following disclaimer in the
00013  *   documentation and/or other materials provided with the distribution.
00014  * - The name of the author may not be used to endorse or promote products
00015  *   derived from this software without specific prior written permission.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00018  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00019  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00020  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00021  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00022  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00023  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00024  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00025  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00026  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00027  */
00028 
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include "bigint.h"
00037 #include "cspan.h"
00038 #include "mytypes.h"
00039 #include "input.h"
00040 #include "os/os.h"
00041 #include "strtab.h"
00042 
00043 #include "lex.h"
00044 
00045 #define TAB_WIDTH 8
00046 
00047 typedef enum {
00048         cs_chr,
00049         cs_str
00050 } chr_str_t;
00051 
00052 static void lex_touch(lex_t *lex);
00053 static bool_t lex_read_try(lex_t *lex);
00054 
00055 static void lex_skip_comment(lex_t *lex);
00056 static void lex_skip_ws(lex_t *lex);
00057 static bool_t is_wstart(char c);
00058 static bool_t is_wcont(char c);
00059 static bool_t is_digit(char c);
00060 static void lex_word(lex_t *lex);
00061 static void lex_char(lex_t *lex);
00062 static void lex_number(lex_t *lex);
00063 static void lex_string(lex_t *lex);
00064 static void lex_char_string_core(lex_t *lex, chr_str_t cs);
00065 static int digit_value(char c);
00066 
00067 /* Note: This imposes an implementation limit on identifier length. */
00068 #define IBUF_SIZE 128
00069 static char ident_buf[IBUF_SIZE + 1];
00070 
00071 /* XXX This imposes an implementation limit on string literal length. */
00072 #define SLBUF_SIZE 128
00073 static char strlit_buf[SLBUF_SIZE + 1];
00074 
00076 struct lc_name {
00077         lclass_t lclass;
00078         const char *name;
00079 };
00080 
00082 static struct lc_name keywords[] = {
00083         { lc_and,       "and" },
00084         { lc_as,        "as" },
00085         { lc_bool,      "bool" },
00086         { lc_break,     "break" },
00087         { lc_builtin,   "builtin" },
00088         { lc_char,      "char" },
00089         { lc_class,     "class" },
00090         { lc_deleg,     "deleg" },
00091         { lc_do,        "do" },
00092         { lc_elif,      "elif" },
00093         { lc_else,      "else" },
00094         { lc_end,       "end" },
00095         { lc_enum,      "enum" },
00096         { lc_except,    "except" },
00097         { lc_false,     "false" },
00098         { lc_finally,   "finally" },
00099         { lc_for,       "for" },
00100         { lc_fun,       "fun" },
00101         { lc_get,       "get" },
00102         { lc_if,        "if" },
00103         { lc_in,        "in" },
00104         { lc_int,       "int" },
00105         { lc_interface, "interface" },
00106         { lc_is,        "is" },
00107         { lc_new,       "new" },
00108         { lc_not,       "not" },
00109         { lc_nil,       "nil" },
00110         { lc_or,        "or" },
00111         { lc_override,  "override" },
00112         { lc_packed,    "packed" },
00113         { lc_private,   "private" },
00114         { lc_prop,      "prop" },
00115         { lc_protected, "protected" },
00116         { lc_public,    "public" },
00117         { lc_raise,     "raise" },
00118         { lc_resource,  "resource" },
00119         { lc_return,    "return" },
00120         { lc_self,      "self" },
00121         { lc_set,       "set" },
00122         { lc_static,    "static" },
00123         { lc_string,    "string" },
00124         { lc_struct,    "struct" },
00125         { lc_switch,    "switch" },
00126         { lc_then,      "then" },
00127         { lc_this,      "this" },
00128         { lc_true,      "true" },
00129         { lc_var,       "var" },
00130         { lc_with,      "with" },
00131         { lc_when,      "when" },
00132         { lc_while,     "while" },
00133         { lc_yield,     "yield" },
00134 
00135         { 0,            NULL }
00136 };
00137 
00139 static struct lc_name simple_lc[] = {
00140         { lc_invalid,   "INVALID" },
00141         { lc_eof,       "EOF" },
00142 
00143         /* Operators */
00144         { lc_period,    "." },
00145         { lc_slash,     "/" },
00146         { lc_lparen,    "(" },
00147         { lc_rparen,    ")" },
00148         { lc_lsbr,      "[" },
00149         { lc_rsbr,      "]" },
00150         { lc_equal,     "==" },
00151         { lc_notequal,  "!=" },
00152         { lc_lt,        "<" },
00153         { lc_gt,        ">" },
00154         { lc_lt_equal,  "<=" },
00155         { lc_gt_equal,  ">=" },
00156         { lc_assign,    "=" },
00157         { lc_plus,      "+" },
00158         { lc_minus,     "-" },
00159         { lc_mult,      "*" },
00160         { lc_increase,  "+=" },
00161 
00162         /* Punctuators */
00163         { lc_comma,     "," },
00164         { lc_colon,     ":" },
00165         { lc_scolon,    ";" },
00166 
00167         { 0,            NULL },
00168 };
00169 
00177 void lclass_print(lclass_t lclass)
00178 {
00179         struct lc_name *dp;
00180 
00181         dp = keywords;
00182         while (dp->name != NULL) {
00183                 if (dp->lclass == lclass) {
00184                         printf("%s", dp->name);
00185                         return;
00186                 }
00187                 ++dp;
00188         }
00189 
00190         dp = simple_lc;
00191         while (dp->name != NULL) {
00192                 if (dp->lclass == lclass) {
00193                         printf("%s", dp->name);
00194                         return;
00195                 }
00196                 ++dp;
00197         }
00198 
00199         switch (lclass) {
00200         case lc_ident:
00201                 printf("ident");
00202                 break;
00203         case lc_lit_int:
00204                 printf("int_literal");
00205                 break;
00206         case lc_lit_string:
00207                 printf("string_literal");
00208                 break;
00209         default:
00210                 printf("<unknown?>");
00211                 break;
00212         }
00213 }
00214 
00221 void lem_print(lem_t *lem)
00222 {
00223         lclass_print(lem->lclass);
00224 
00225         switch (lem->lclass) {
00226         case lc_ident:
00227                 printf("('%s')", strtab_get_str(lem->u.ident.sid));
00228                 break;
00229         case lc_lit_int:
00230                 printf("(");
00231                 bigint_print(&lem->u.lit_int.value);
00232                 printf(")");
00233                 break;
00234         case lc_lit_string:
00235                 printf("(\"%s\")", lem->u.lit_string.value);
00236         default:
00237                 break;
00238         }
00239 }
00240 
00247 void lem_print_coords(lem_t *lem)
00248 {
00249         cspan_print(lem->cspan);
00250 }
00251 
00257 void lex_init(lex_t *lex, struct input *input)
00258 {
00259         int rc;
00260 
00261         lex->input = input;
00262 
00263         rc = input_get_line(lex->input, &lex->inbuf);
00264         if (rc != EOK) {
00265                 printf("Error reading input.\n");
00266                 exit(1);
00267         }
00268 
00269         lex->ibp = lex->inbuf;
00270         lex->col_adj = 0;
00271         lex->prev_valid = b_false;
00272         lex->current_valid = b_true;
00273 }
00274 
00281 void lex_next(lex_t *lex)
00282 {
00283         /* Make sure the current lem has already been read in. */
00284         lex_touch(lex);
00285 
00286         /* Force a new lem to be read on next access. */
00287         lex->current_valid = b_false;
00288 }
00289 
00298 lem_t *lex_get_current(lex_t *lex)
00299 {
00300         lex_touch(lex);
00301         return &lex->current;
00302 }
00303 
00312 lem_t *lex_peek_prev(lex_t *lex)
00313 {
00314         if (lex->current_valid == b_false) {
00315                 /*
00316                  * This means the head is advanced but next lem was not read.
00317                  * Thus the previous lem is still in @a current.
00318                  */
00319                 return &lex->current;
00320         }
00321 
00322         if (lex->prev_valid != b_true) {
00323                 /* Looks like we are still at the first lem. */
00324                 return NULL;
00325         }
00326 
00327         /*
00328          * Current lem has been read in. Thus the previous lem was moved to
00329          * @a previous.
00330          */
00331         return &lex->prev;
00332 }
00333 
00338 static void lex_touch(lex_t *lex)
00339 {
00340         bool_t got_lem;
00341 
00342         if (lex->current_valid == b_true)
00343                 return;
00344 
00345         /* Copy previous lem */
00346         lex->prev = lex->current;
00347         lex->prev_valid = b_true;
00348 
00349         do {
00350                 got_lem = lex_read_try(lex);
00351         } while (got_lem == b_false);
00352 
00353         lex->current_valid = b_true;
00354 }
00355 
00367 static bool_t lex_read_try(lex_t *lex)
00368 {
00369         char *bp, *lsp;
00370         int line0, col0;
00371 
00372         lex_skip_ws(lex);
00373 
00374         /*
00375          * Record lem coordinates. Line number we already have. For column
00376          * number we start with position in the input buffer. This works
00377          * for all characters except tab. Thus we keep track of tabs
00378          * separately using col_adj.
00379          */
00380         line0 = input_get_line_no(lex->input);
00381         col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
00382 
00383         lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
00384 
00385         lsp = lex->ibp;
00386         bp = lex->ibp;
00387 
00388         if (bp[0] == '\0') {
00389                 /* End of input */
00390                 lex->current.lclass = lc_eof;
00391                 goto finish;
00392         }
00393 
00394         if (is_wstart(bp[0])) {
00395                 lex_word(lex);
00396                 goto finish;
00397         }
00398 
00399         if (bp[0] == '\'') {
00400                 lex_char(lex);
00401                 goto finish;
00402         }
00403 
00404         if (is_digit(bp[0])) {
00405                 lex_number(lex);
00406                 goto finish;
00407         }
00408 
00409         if (bp[0] == '"') {
00410                 lex_string(lex);
00411                 goto finish;
00412         }
00413 
00414         if (bp[0] == '-' && bp[1] == '-') {
00415                 lex_skip_comment(lex);
00416 
00417                 /* Compute ending column number */
00418                 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
00419 
00420                 /* Try again */
00421                 return b_false;
00422         }
00423 
00424         switch (bp[0]) {
00425         case ',': lex->current.lclass = lc_comma; ++bp; break;
00426         case ':': lex->current.lclass = lc_colon; ++bp; break;
00427         case ';': lex->current.lclass = lc_scolon; ++bp; break;
00428 
00429         case '.': lex->current.lclass = lc_period; ++bp; break;
00430         case '/': lex->current.lclass = lc_slash; ++bp; break;
00431         case '(': lex->current.lclass = lc_lparen; ++bp; break;
00432         case ')': lex->current.lclass = lc_rparen; ++bp; break;
00433         case '[': lex->current.lclass = lc_lsbr; ++bp; break;
00434         case ']': lex->current.lclass = lc_rsbr; ++bp; break;
00435 
00436         case '=':
00437                 if (bp[1] == '=') {
00438                         lex->current.lclass = lc_equal; bp += 2; break;
00439                 }
00440                 lex->current.lclass = lc_assign; ++bp; break;
00441 
00442         case '!':
00443                 if (bp[1] == '=') {
00444                         lex->current.lclass = lc_notequal; bp += 2; break;
00445                 }
00446                 goto invalid;
00447 
00448         case '+':
00449                 if (bp[1] == '=') {
00450                         lex->current.lclass = lc_increase; bp += 2; break;
00451                 }
00452                 lex->current.lclass = lc_plus; ++bp; break;
00453 
00454         case '-':
00455                 lex->current.lclass = lc_minus; ++bp; break;
00456 
00457         case '*':
00458                 lex->current.lclass = lc_mult; ++bp; break;
00459 
00460         case '<':
00461                 if (bp[1] == '=') {
00462                         lex->current.lclass = lc_lt_equal; bp += 2; break;
00463                 }
00464                 lex->current.lclass = lc_lt; ++bp; break;
00465 
00466         case '>':
00467                 if (bp[1] == '=') {
00468                         lex->current.lclass = lc_gt_equal; bp += 2; break;
00469                 }
00470                 lex->current.lclass = lc_gt; ++bp; break;
00471 
00472         default:
00473                 goto invalid;
00474         }
00475 
00476         lex->ibp = bp;
00477 
00478 finish:
00479         /* Compute ending column number */
00480         lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
00481         return b_true;
00482 
00483 invalid:
00484         lex->current.lclass = lc_invalid;
00485         ++bp;
00486         lex->ibp = bp;
00487 
00488         return b_true;
00489 }
00490 
00498 static void lex_word(lex_t *lex)
00499 {
00500         struct lc_name *dp;
00501         char *bp;
00502         int idx;
00503 
00504         bp = lex->ibp;
00505         ident_buf[0] = bp[0];
00506         idx = 1;
00507 
00508         while (is_wcont(bp[idx])) {
00509                 if (idx >= IBUF_SIZE) {
00510                         printf("Error: Identifier too long.\n");
00511                         exit(1);
00512                 }
00513 
00514                 ident_buf[idx] = bp[idx];
00515                 ++idx;
00516         }
00517 
00518         lex->ibp = bp + idx;
00519 
00520         ident_buf[idx] = '\0';
00521 
00522         dp = keywords;
00523         while (dp->name != NULL) {
00524                 if (os_str_cmp(ident_buf, dp->name) == 0) {
00525                         /* Match */
00526                         lex->current.lclass = dp->lclass;
00527                         return;
00528                 }
00529                 ++dp;
00530         }
00531 
00532         /* No matching keyword -- it must be an identifier. */
00533         lex->current.lclass = lc_ident;
00534         lex->current.u.ident.sid = strtab_get_sid(ident_buf);
00535 }
00536 
00543 static void lex_char(lex_t *lex)
00544 {
00545         size_t len;
00546         int char_val;
00547 
00548         lex_char_string_core(lex, cs_chr);
00549 
00550         len = os_str_length(strlit_buf);
00551         if (len != 1) {
00552                 printf("Character literal should contain one character, "
00553                     "but contains %u characters instead.\n", (unsigned) len);
00554                 exit(1);
00555         }
00556 
00557         os_str_get_char(strlit_buf, 0, &char_val);
00558         lex->current.lclass = lc_lit_char;
00559         bigint_init(&lex->current.u.lit_char.value, char_val);
00560 }
00561 
00568 static void lex_number(lex_t *lex)
00569 {
00570         char *bp;
00571         bigint_t value;
00572         bigint_t dgval;
00573         bigint_t base;
00574         bigint_t tprod;
00575 
00576         bp = lex->ibp;
00577 
00578         bigint_init(&value, 0);
00579         bigint_init(&base, 10);
00580 
00581         while (is_digit(*bp)) {
00582                 bigint_mul(&value, &base, &tprod);
00583                 bigint_init(&dgval, digit_value(*bp));
00584 
00585                 bigint_destroy(&value);
00586                 bigint_add(&tprod, &dgval, &value);
00587                 bigint_destroy(&tprod);
00588                 bigint_destroy(&dgval);
00589 
00590                 ++bp;
00591         }
00592 
00593         bigint_destroy(&base);
00594 
00595         lex->ibp = bp;
00596 
00597         lex->current.lclass = lc_lit_int;
00598         bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
00599 }
00600 
00607 static void lex_string(lex_t *lex)
00608 {
00609         lex_char_string_core(lex, cs_str);
00610 
00611         lex->current.lclass = lc_lit_string;
00612         lex->current.u.lit_string.value = os_str_dup(strlit_buf);
00613 }
00614 
00615 static void lex_char_string_core(lex_t *lex, chr_str_t cs)
00616 {
00617         char *bp;
00618         int sidx, didx;
00619         char term;
00620         const char *descr, *cap_descr;
00621         char spchar;
00622 
00623         /* Make compiler happy */
00624         term = '\0';
00625         descr = NULL;
00626         cap_descr = NULL;
00627 
00628         switch (cs) {
00629         case cs_chr:
00630                 term = '\'';
00631                 descr = "character";
00632                 cap_descr = "Character";
00633                 break;
00634         case cs_str:
00635                 term = '"';
00636                 descr = "string";
00637                 cap_descr = "String";
00638                 break;
00639         }
00640 
00641         bp = lex->ibp + 1;
00642         sidx = didx = 0;
00643 
00644         while (bp[sidx] != term) {
00645                 if (didx >= SLBUF_SIZE) {
00646                         printf("Error: %s literal too long.\n", cap_descr);
00647                         exit(1);
00648                 }
00649 
00650                 if (bp[sidx] == '\0') {
00651                         printf("Error: Unterminated %s literal.\n", descr);
00652                         exit(1);
00653                 }
00654 
00655                 if (bp[sidx] == '\\') {
00656                         switch (bp[sidx + 1]) {
00657                         case '\\':
00658                                 spchar = '\\';
00659                                 break;
00660                         case '\'':
00661                                 spchar = '\'';
00662                                 break;
00663                         case '"':
00664                                 spchar = '"';
00665                                 break;
00666                         case 'n':
00667                                 spchar = '\n';
00668                                 break;
00669                         case 't':
00670                                 spchar = '\t';
00671                                 break;
00672                         default:
00673                                 printf("Error: Unknown character escape sequence.\n");
00674                                 exit(1);
00675                         }
00676 
00677                         strlit_buf[didx] = spchar;
00678                         ++didx;
00679                         sidx += 2;
00680                 } else {
00681                         strlit_buf[didx] = bp[sidx];
00682                         ++sidx; ++didx;
00683                 }
00684         }
00685 
00686         lex->ibp = bp + sidx + 1;
00687 
00688         strlit_buf[didx] = '\0';
00689 }
00690 
00697 static void lex_skip_comment(lex_t *lex)
00698 {
00699         char *bp;
00700 
00701         bp = lex->ibp + 2;
00702 
00703         while (*bp != '\n' && *bp != '\0') {
00704                 ++bp;
00705         }
00706 
00707         lex->ibp = bp;
00708 }
00709 
00716 static void lex_skip_ws(lex_t *lex)
00717 {
00718         char *bp;
00719         int rc;
00720 
00721         bp = lex->ibp;
00722 
00723         while (b_true) {
00724                 while (*bp == ' ' || *bp == '\t') {
00725                         if (*bp == '\t') {
00726                                 /* XXX This is too simplifed. */
00727                                 lex->col_adj += (TAB_WIDTH - 1);
00728                         }
00729                         ++bp;
00730                 }
00731 
00732                 if (*bp != '\n')
00733                         break;
00734 
00735                 /* Read next line */
00736                 rc = input_get_line(lex->input, &lex->inbuf);
00737                 if (rc != EOK) {
00738                         printf("Error reading input.\n");
00739                         exit(1);
00740                 }
00741 
00742                 bp = lex->inbuf;
00743                 lex->col_adj = 0;
00744         }
00745 
00746         lex->ibp = bp;
00747 }
00748 
00754 static bool_t is_wstart(char c)
00755 {
00756         return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
00757             (c == '_');
00758 }
00759 
00766 static bool_t is_wcont(char c)
00767 {
00768         return is_digit(c) || is_wstart(c);
00769 }
00770 
00776 static bool_t is_digit(char c)
00777 {
00778         return ((c >= '0') && (c <= '9'));
00779 }
00780 
00786 static int digit_value(char c)
00787 {
00788         return (c - '0');
00789 }

Generated on Thu Jun 2 07:45:42 2011 for HelenOS/USB by  doxygen 1.4.7