00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include "bigint.h"
00037 #include "cspan.h"
00038 #include "mytypes.h"
00039 #include "input.h"
00040 #include "os/os.h"
00041 #include "strtab.h"
00042
00043 #include "lex.h"
00044
00045 #define TAB_WIDTH 8
00046
00047 typedef enum {
00048 cs_chr,
00049 cs_str
00050 } chr_str_t;
00051
00052 static void lex_touch(lex_t *lex);
00053 static bool_t lex_read_try(lex_t *lex);
00054
00055 static void lex_skip_comment(lex_t *lex);
00056 static void lex_skip_ws(lex_t *lex);
00057 static bool_t is_wstart(char c);
00058 static bool_t is_wcont(char c);
00059 static bool_t is_digit(char c);
00060 static void lex_word(lex_t *lex);
00061 static void lex_char(lex_t *lex);
00062 static void lex_number(lex_t *lex);
00063 static void lex_string(lex_t *lex);
00064 static void lex_char_string_core(lex_t *lex, chr_str_t cs);
00065 static int digit_value(char c);
00066
00067
00068 #define IBUF_SIZE 128
00069 static char ident_buf[IBUF_SIZE + 1];
00070
00071
00072 #define SLBUF_SIZE 128
00073 static char strlit_buf[SLBUF_SIZE + 1];
00074
00076 struct lc_name {
00077 lclass_t lclass;
00078 const char *name;
00079 };
00080
00082 static struct lc_name keywords[] = {
00083 { lc_and, "and" },
00084 { lc_as, "as" },
00085 { lc_bool, "bool" },
00086 { lc_break, "break" },
00087 { lc_builtin, "builtin" },
00088 { lc_char, "char" },
00089 { lc_class, "class" },
00090 { lc_deleg, "deleg" },
00091 { lc_do, "do" },
00092 { lc_elif, "elif" },
00093 { lc_else, "else" },
00094 { lc_end, "end" },
00095 { lc_enum, "enum" },
00096 { lc_except, "except" },
00097 { lc_false, "false" },
00098 { lc_finally, "finally" },
00099 { lc_for, "for" },
00100 { lc_fun, "fun" },
00101 { lc_get, "get" },
00102 { lc_if, "if" },
00103 { lc_in, "in" },
00104 { lc_int, "int" },
00105 { lc_interface, "interface" },
00106 { lc_is, "is" },
00107 { lc_new, "new" },
00108 { lc_not, "not" },
00109 { lc_nil, "nil" },
00110 { lc_or, "or" },
00111 { lc_override, "override" },
00112 { lc_packed, "packed" },
00113 { lc_private, "private" },
00114 { lc_prop, "prop" },
00115 { lc_protected, "protected" },
00116 { lc_public, "public" },
00117 { lc_raise, "raise" },
00118 { lc_resource, "resource" },
00119 { lc_return, "return" },
00120 { lc_self, "self" },
00121 { lc_set, "set" },
00122 { lc_static, "static" },
00123 { lc_string, "string" },
00124 { lc_struct, "struct" },
00125 { lc_switch, "switch" },
00126 { lc_then, "then" },
00127 { lc_this, "this" },
00128 { lc_true, "true" },
00129 { lc_var, "var" },
00130 { lc_with, "with" },
00131 { lc_when, "when" },
00132 { lc_while, "while" },
00133 { lc_yield, "yield" },
00134
00135 { 0, NULL }
00136 };
00137
00139 static struct lc_name simple_lc[] = {
00140 { lc_invalid, "INVALID" },
00141 { lc_eof, "EOF" },
00142
00143
00144 { lc_period, "." },
00145 { lc_slash, "/" },
00146 { lc_lparen, "(" },
00147 { lc_rparen, ")" },
00148 { lc_lsbr, "[" },
00149 { lc_rsbr, "]" },
00150 { lc_equal, "==" },
00151 { lc_notequal, "!=" },
00152 { lc_lt, "<" },
00153 { lc_gt, ">" },
00154 { lc_lt_equal, "<=" },
00155 { lc_gt_equal, ">=" },
00156 { lc_assign, "=" },
00157 { lc_plus, "+" },
00158 { lc_minus, "-" },
00159 { lc_mult, "*" },
00160 { lc_increase, "+=" },
00161
00162
00163 { lc_comma, "," },
00164 { lc_colon, ":" },
00165 { lc_scolon, ";" },
00166
00167 { 0, NULL },
00168 };
00169
00177 void lclass_print(lclass_t lclass)
00178 {
00179 struct lc_name *dp;
00180
00181 dp = keywords;
00182 while (dp->name != NULL) {
00183 if (dp->lclass == lclass) {
00184 printf("%s", dp->name);
00185 return;
00186 }
00187 ++dp;
00188 }
00189
00190 dp = simple_lc;
00191 while (dp->name != NULL) {
00192 if (dp->lclass == lclass) {
00193 printf("%s", dp->name);
00194 return;
00195 }
00196 ++dp;
00197 }
00198
00199 switch (lclass) {
00200 case lc_ident:
00201 printf("ident");
00202 break;
00203 case lc_lit_int:
00204 printf("int_literal");
00205 break;
00206 case lc_lit_string:
00207 printf("string_literal");
00208 break;
00209 default:
00210 printf("<unknown?>");
00211 break;
00212 }
00213 }
00214
00221 void lem_print(lem_t *lem)
00222 {
00223 lclass_print(lem->lclass);
00224
00225 switch (lem->lclass) {
00226 case lc_ident:
00227 printf("('%s')", strtab_get_str(lem->u.ident.sid));
00228 break;
00229 case lc_lit_int:
00230 printf("(");
00231 bigint_print(&lem->u.lit_int.value);
00232 printf(")");
00233 break;
00234 case lc_lit_string:
00235 printf("(\"%s\")", lem->u.lit_string.value);
00236 default:
00237 break;
00238 }
00239 }
00240
00247 void lem_print_coords(lem_t *lem)
00248 {
00249 cspan_print(lem->cspan);
00250 }
00251
00257 void lex_init(lex_t *lex, struct input *input)
00258 {
00259 int rc;
00260
00261 lex->input = input;
00262
00263 rc = input_get_line(lex->input, &lex->inbuf);
00264 if (rc != EOK) {
00265 printf("Error reading input.\n");
00266 exit(1);
00267 }
00268
00269 lex->ibp = lex->inbuf;
00270 lex->col_adj = 0;
00271 lex->prev_valid = b_false;
00272 lex->current_valid = b_true;
00273 }
00274
00281 void lex_next(lex_t *lex)
00282 {
00283
00284 lex_touch(lex);
00285
00286
00287 lex->current_valid = b_false;
00288 }
00289
00298 lem_t *lex_get_current(lex_t *lex)
00299 {
00300 lex_touch(lex);
00301 return &lex->current;
00302 }
00303
00312 lem_t *lex_peek_prev(lex_t *lex)
00313 {
00314 if (lex->current_valid == b_false) {
00315
00316
00317
00318
00319 return &lex->current;
00320 }
00321
00322 if (lex->prev_valid != b_true) {
00323
00324 return NULL;
00325 }
00326
00327
00328
00329
00330
00331 return &lex->prev;
00332 }
00333
00338 static void lex_touch(lex_t *lex)
00339 {
00340 bool_t got_lem;
00341
00342 if (lex->current_valid == b_true)
00343 return;
00344
00345
00346 lex->prev = lex->current;
00347 lex->prev_valid = b_true;
00348
00349 do {
00350 got_lem = lex_read_try(lex);
00351 } while (got_lem == b_false);
00352
00353 lex->current_valid = b_true;
00354 }
00355
00367 static bool_t lex_read_try(lex_t *lex)
00368 {
00369 char *bp, *lsp;
00370 int line0, col0;
00371
00372 lex_skip_ws(lex);
00373
00374
00375
00376
00377
00378
00379
00380 line0 = input_get_line_no(lex->input);
00381 col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
00382
00383 lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
00384
00385 lsp = lex->ibp;
00386 bp = lex->ibp;
00387
00388 if (bp[0] == '\0') {
00389
00390 lex->current.lclass = lc_eof;
00391 goto finish;
00392 }
00393
00394 if (is_wstart(bp[0])) {
00395 lex_word(lex);
00396 goto finish;
00397 }
00398
00399 if (bp[0] == '\'') {
00400 lex_char(lex);
00401 goto finish;
00402 }
00403
00404 if (is_digit(bp[0])) {
00405 lex_number(lex);
00406 goto finish;
00407 }
00408
00409 if (bp[0] == '"') {
00410 lex_string(lex);
00411 goto finish;
00412 }
00413
00414 if (bp[0] == '-' && bp[1] == '-') {
00415 lex_skip_comment(lex);
00416
00417
00418 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
00419
00420
00421 return b_false;
00422 }
00423
00424 switch (bp[0]) {
00425 case ',': lex->current.lclass = lc_comma; ++bp; break;
00426 case ':': lex->current.lclass = lc_colon; ++bp; break;
00427 case ';': lex->current.lclass = lc_scolon; ++bp; break;
00428
00429 case '.': lex->current.lclass = lc_period; ++bp; break;
00430 case '/': lex->current.lclass = lc_slash; ++bp; break;
00431 case '(': lex->current.lclass = lc_lparen; ++bp; break;
00432 case ')': lex->current.lclass = lc_rparen; ++bp; break;
00433 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
00434 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
00435
00436 case '=':
00437 if (bp[1] == '=') {
00438 lex->current.lclass = lc_equal; bp += 2; break;
00439 }
00440 lex->current.lclass = lc_assign; ++bp; break;
00441
00442 case '!':
00443 if (bp[1] == '=') {
00444 lex->current.lclass = lc_notequal; bp += 2; break;
00445 }
00446 goto invalid;
00447
00448 case '+':
00449 if (bp[1] == '=') {
00450 lex->current.lclass = lc_increase; bp += 2; break;
00451 }
00452 lex->current.lclass = lc_plus; ++bp; break;
00453
00454 case '-':
00455 lex->current.lclass = lc_minus; ++bp; break;
00456
00457 case '*':
00458 lex->current.lclass = lc_mult; ++bp; break;
00459
00460 case '<':
00461 if (bp[1] == '=') {
00462 lex->current.lclass = lc_lt_equal; bp += 2; break;
00463 }
00464 lex->current.lclass = lc_lt; ++bp; break;
00465
00466 case '>':
00467 if (bp[1] == '=') {
00468 lex->current.lclass = lc_gt_equal; bp += 2; break;
00469 }
00470 lex->current.lclass = lc_gt; ++bp; break;
00471
00472 default:
00473 goto invalid;
00474 }
00475
00476 lex->ibp = bp;
00477
00478 finish:
00479
00480 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
00481 return b_true;
00482
00483 invalid:
00484 lex->current.lclass = lc_invalid;
00485 ++bp;
00486 lex->ibp = bp;
00487
00488 return b_true;
00489 }
00490
00498 static void lex_word(lex_t *lex)
00499 {
00500 struct lc_name *dp;
00501 char *bp;
00502 int idx;
00503
00504 bp = lex->ibp;
00505 ident_buf[0] = bp[0];
00506 idx = 1;
00507
00508 while (is_wcont(bp[idx])) {
00509 if (idx >= IBUF_SIZE) {
00510 printf("Error: Identifier too long.\n");
00511 exit(1);
00512 }
00513
00514 ident_buf[idx] = bp[idx];
00515 ++idx;
00516 }
00517
00518 lex->ibp = bp + idx;
00519
00520 ident_buf[idx] = '\0';
00521
00522 dp = keywords;
00523 while (dp->name != NULL) {
00524 if (os_str_cmp(ident_buf, dp->name) == 0) {
00525
00526 lex->current.lclass = dp->lclass;
00527 return;
00528 }
00529 ++dp;
00530 }
00531
00532
00533 lex->current.lclass = lc_ident;
00534 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
00535 }
00536
00543 static void lex_char(lex_t *lex)
00544 {
00545 size_t len;
00546 int char_val;
00547
00548 lex_char_string_core(lex, cs_chr);
00549
00550 len = os_str_length(strlit_buf);
00551 if (len != 1) {
00552 printf("Character literal should contain one character, "
00553 "but contains %u characters instead.\n", (unsigned) len);
00554 exit(1);
00555 }
00556
00557 os_str_get_char(strlit_buf, 0, &char_val);
00558 lex->current.lclass = lc_lit_char;
00559 bigint_init(&lex->current.u.lit_char.value, char_val);
00560 }
00561
00568 static void lex_number(lex_t *lex)
00569 {
00570 char *bp;
00571 bigint_t value;
00572 bigint_t dgval;
00573 bigint_t base;
00574 bigint_t tprod;
00575
00576 bp = lex->ibp;
00577
00578 bigint_init(&value, 0);
00579 bigint_init(&base, 10);
00580
00581 while (is_digit(*bp)) {
00582 bigint_mul(&value, &base, &tprod);
00583 bigint_init(&dgval, digit_value(*bp));
00584
00585 bigint_destroy(&value);
00586 bigint_add(&tprod, &dgval, &value);
00587 bigint_destroy(&tprod);
00588 bigint_destroy(&dgval);
00589
00590 ++bp;
00591 }
00592
00593 bigint_destroy(&base);
00594
00595 lex->ibp = bp;
00596
00597 lex->current.lclass = lc_lit_int;
00598 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
00599 }
00600
00607 static void lex_string(lex_t *lex)
00608 {
00609 lex_char_string_core(lex, cs_str);
00610
00611 lex->current.lclass = lc_lit_string;
00612 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
00613 }
00614
00615 static void lex_char_string_core(lex_t *lex, chr_str_t cs)
00616 {
00617 char *bp;
00618 int sidx, didx;
00619 char term;
00620 const char *descr, *cap_descr;
00621 char spchar;
00622
00623
00624 term = '\0';
00625 descr = NULL;
00626 cap_descr = NULL;
00627
00628 switch (cs) {
00629 case cs_chr:
00630 term = '\'';
00631 descr = "character";
00632 cap_descr = "Character";
00633 break;
00634 case cs_str:
00635 term = '"';
00636 descr = "string";
00637 cap_descr = "String";
00638 break;
00639 }
00640
00641 bp = lex->ibp + 1;
00642 sidx = didx = 0;
00643
00644 while (bp[sidx] != term) {
00645 if (didx >= SLBUF_SIZE) {
00646 printf("Error: %s literal too long.\n", cap_descr);
00647 exit(1);
00648 }
00649
00650 if (bp[sidx] == '\0') {
00651 printf("Error: Unterminated %s literal.\n", descr);
00652 exit(1);
00653 }
00654
00655 if (bp[sidx] == '\\') {
00656 switch (bp[sidx + 1]) {
00657 case '\\':
00658 spchar = '\\';
00659 break;
00660 case '\'':
00661 spchar = '\'';
00662 break;
00663 case '"':
00664 spchar = '"';
00665 break;
00666 case 'n':
00667 spchar = '\n';
00668 break;
00669 case 't':
00670 spchar = '\t';
00671 break;
00672 default:
00673 printf("Error: Unknown character escape sequence.\n");
00674 exit(1);
00675 }
00676
00677 strlit_buf[didx] = spchar;
00678 ++didx;
00679 sidx += 2;
00680 } else {
00681 strlit_buf[didx] = bp[sidx];
00682 ++sidx; ++didx;
00683 }
00684 }
00685
00686 lex->ibp = bp + sidx + 1;
00687
00688 strlit_buf[didx] = '\0';
00689 }
00690
00697 static void lex_skip_comment(lex_t *lex)
00698 {
00699 char *bp;
00700
00701 bp = lex->ibp + 2;
00702
00703 while (*bp != '\n' && *bp != '\0') {
00704 ++bp;
00705 }
00706
00707 lex->ibp = bp;
00708 }
00709
00716 static void lex_skip_ws(lex_t *lex)
00717 {
00718 char *bp;
00719 int rc;
00720
00721 bp = lex->ibp;
00722
00723 while (b_true) {
00724 while (*bp == ' ' || *bp == '\t') {
00725 if (*bp == '\t') {
00726
00727 lex->col_adj += (TAB_WIDTH - 1);
00728 }
00729 ++bp;
00730 }
00731
00732 if (*bp != '\n')
00733 break;
00734
00735
00736 rc = input_get_line(lex->input, &lex->inbuf);
00737 if (rc != EOK) {
00738 printf("Error reading input.\n");
00739 exit(1);
00740 }
00741
00742 bp = lex->inbuf;
00743 lex->col_adj = 0;
00744 }
00745
00746 lex->ibp = bp;
00747 }
00748
00754 static bool_t is_wstart(char c)
00755 {
00756 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
00757 (c == '_');
00758 }
00759
00766 static bool_t is_wcont(char c)
00767 {
00768 return is_digit(c) || is_wstart(c);
00769 }
00770
00776 static bool_t is_digit(char c)
00777 {
00778 return ((c >= '0') && (c <= '9'));
00779 }
00780
00786 static int digit_value(char c)
00787 {
00788 return (c - '0');
00789 }