-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.l
476 lines (434 loc) · 16.6 KB
/
lexer.l
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
%{
#include "lexer_common.h"
%}
%option outfile="lexer.c" header-file="lexer.h"
%option warn nodefault
%option reentrant noyywrap never-interactive nounistd
%option bison-bridge
%e 1019
%p 2807
%n 371
%k 284
%a 1213
%o 1117
O [0-7]
D [0-9]
NZ [1-9]
L [a-zA-Z_]
A [a-zA-Z_0-9]
H [a-fA-F0-9]
HP (0[xX])
E ([Ee][+-]?{D}+)
P ([Pp][+-]?{D}+)
FS (f|F|l|L)
IS (((u|U)(l|L|ll|LL)?)|((l|L|ll|LL)(u|U)?))
CP (u|U|L)
SP (u8|u|U|L)
ES (\\(['"\?\\abfnrtv]|[0-7]{1,3}|x[a-fA-F0-9]+))
WS [ \t\v\n\f]
%{
/* Error reporting hook; the definition in this file writes the message to stderr. */
extern void yyerror(const char *); /* prints grammar violation message */
/*
 * NOTE(review): the prototype below is shadowed right away by the
 * function-like macro of the same name, which turns every call into the
 * IDENTIFIER token. No use of sym_type is visible in the rules of this
 * file, so both lines look like leftovers - confirm before removing.
 */
extern int sym_type(const char *); /* returns type from symbol table */
#define sym_type(identifier) IDENTIFIER /* with no symbol table, fake it */
/* Forward declarations of the helpers defined in the user-code section. */
static void consume_comment(yyscan_t yyscanner);
static int check_type(const char* text);
%}
%%
"/*" { consume_comment(yyscanner); }
"//".* { /* consume //-comment */ }
"auto" { return app3(yylval->token, AUTO, yytext); }
"break" { return app3(yylval->token, BREAK, yytext); }
"case" { return app3(yylval->token, CASE, yytext); }
"char" { return app3(yylval->token, CHAR, yytext); }
"const" { return app3(yylval->token, CONST, yytext); }
"continue" { return app3(yylval->token, CONTINUE, yytext); }
"default" { return app3(yylval->token, DEFAULT, yytext); }
"do" { return app3(yylval->token, DO, yytext); }
"double" { return app3(yylval->token, DOUBLE, yytext); }
"else" { return app3(yylval->token, ELSE, yytext); }
"enum" { return app3(yylval->token, ENUM, yytext); }
"extern" { return app3(yylval->token, EXTERN, yytext); }
"float" { return app3(yylval->token, FLOAT, yytext); }
"for" { return app3(yylval->token, FOR, yytext); }
"goto" { return app3(yylval->token, GOTO, yytext); }
"if" { return app3(yylval->token, IF, yytext); }
"inline" { return app3(yylval->token, INLINE, yytext); }
"int" { return app3(yylval->token, INT, yytext); }
"long" { return app3(yylval->token, LONG, yytext); }
"register" { return app3(yylval->token, REGISTER, yytext); }
"restrict" { return app3(yylval->token, RESTRICT, yytext); }
"return" { return app3(yylval->token, RETURN, yytext); }
"short" { return app3(yylval->token, SHORT, yytext); }
"signed" { return app3(yylval->token, SIGNED, yytext); }
"sizeof" { return app3(yylval->token, SIZEOF, yytext); }
"static" { return app3(yylval->token, STATIC, yytext); }
"struct" { return app3(yylval->token, STRUCT, yytext); }
"switch" { return app3(yylval->token, SWITCH, yytext); }
"typedef" { recent_typedef_keyword = states.get_brack_count();
braces_since_typedef = 0;
return app3(yylval->token, TYPEDEF, yytext); }
"union" { return app3(yylval->token, UNION, yytext); }
"unsigned" { return app3(yylval->token, UNSIGNED, yytext); }
"void" { return app3(yylval->token, VOID, yytext); }
"volatile" { return app3(yylval->token, VOLATILE, yytext); }
"while" { return app3(yylval->token, WHILE, yytext); }
"_Alignas" { return app3(yylval->token, ALIGNAS, yytext); }
"_Alignof" { return app3(yylval->token, ALIGNOF, yytext); }
"_Atomic" { return app3(yylval->token, ATOMIC, yytext); }
"_Bool" { return app3(yylval->token, BOOL, yytext); }
"_Complex" { return app3(yylval->token, COMPLEX, yytext); }
"_Generic" { return app3(yylval->token, GENERIC, yytext); }
"_Imaginary" { return app3(yylval->token, IMAGINARY, yytext); }
"_Noreturn" { return app3(yylval->token, NORETURN, yytext); }
"_Static_assert" { return app3(yylval->token, STATIC_ASSERT, yytext); }
"_Thread_local" { return app3(yylval->token, THREAD_LOCAL, yytext); }
"__func__" { return app3(yylval->token, FUNC_NAME, yytext); }
"__FUNCTION__" { return app3(yylval->token, FUNCTION_NAME, yytext); }
"__attribute__" { return app3(yylval->token, ATTRIBUTE, yytext); }
{L}{A}* {
		/*
		 * Identifier-shaped token. The spelling alone does not say whether
		 * this is a plain identifier, a typedef name, an enumeration constant
		 * or an attribute name, so check_type() decides and the matching node
		 * type is created for the parser.
		 */

		// it should rather be: set_state (IDENTIFIER_OR_ENUMERATOR_OR...)
		// nonetheless, it should be clear what was meant here.
		states.set_state(yytext, MAYBE_IDENTIFIER);

		// the flag is only raised by a preceding . or -> token,
		// so this is the single place where it is lowered again
		recent_struct_access = false;

		// advance to the first character that can not belong to the name
		const char* stop = yytext;
		do
		{
			++stop;
		} while(isalnum(*stop) || *stop == '_');
		const std::size_t name_len = stop - yytext;

		cnt();

		// NUL-terminated heap copy of the spelling for the node constructor
		// NOTE(review): who releases this buffer is not visible from this
		// rule - confirm that the node types take ownership
		char* spelling = new char[name_len + 1];
		strncpy(spelling, yytext, name_len);
		spelling[name_len] = 0;

		const int kind = check_type(yytext);
		if(kind == IDENTIFIER)
		{
#ifdef LEXER_DEBUG
			std::cout << " -> IDENTIFIER: " << spelling << std::endl;
#endif
			return app(yylval->name = new identifier_t(spelling, get_pos()), kind);
		}
		else if(kind == TYPEDEF_NAME)
		{
#ifdef LEXER_DEBUG
			std::cout << " -> TYPEDEF NAME: " << spelling << std::endl;
#endif
			return app(yylval->typedef_name = new typedef_name_t(spelling, get_pos()), kind);
		}
		else if(kind == ENUMERATION_CONSTANT)
		{
#ifdef LEXER_DEBUG
			std::cout << " -> ENUMERATION_CONSTANT: " << spelling << std::endl;
#endif
			return app(yylval->enumeration_constant = new enumeration_constant_t(spelling, get_pos()), kind);
		}
		else if(kind == ATTR_NAME)
		{
#ifdef LEXER_DEBUG
			std::cout << " -> ATTR_NAME: " << spelling << std::endl;
#endif
			return app(yylval->attr_name = new attr_name_t(spelling, get_pos()), kind);
		}
		// any other kind: no token is returned and scanning simply continues
}
{HP}{H}+{IS}? { return app_int(yylval->iconstant, yytext, 'x', yyleng); }
{NZ}{D}*{IS}? { return app_int(yylval->iconstant, yytext, 'd', yyleng); }
"0"{O}*{IS}? { return app_int(yylval->iconstant, yytext, 'o', yyleng); }
{CP}?"'"([^'\\\n]|{ES})+"'" {
		/*
		 * Character constant with an optional encoding prefix. The action
		 * re-walks the matched text as a consistency check and then hands
		 * the whole spelling on as an I_CONSTANT.
		 */
		const char* cur = yytext;
		if(icmp(cur, 'u') || *cur == 'L')
			++cur;	// encoding prefix
		++cur;		// opening quote

		// body: plain characters and escape sequences up to the closing quote
		for(;;)
		{
			if(*cur == '\\')
			{
				skip_esc_seq(cur);
				continue;
			}
			if(*cur == '\'')
				break;	// closing quote reached
			if(*cur == '\n')
				throw std::runtime_error("lexer error for char parsing");
			++cur;		// ordinary character
		}
		++cur;		// closing quote

		// the walk has to end exactly at the terminator of the matched text
		if(*cur)
			throw std::logic_error("end of token not 0");
		return app_with_string(yylval->iconstant, I_CONSTANT, yytext, yyleng);
}
{D}+{E}{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
{D}*"."{D}+{E}?{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
{D}+"."{E}?{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
{HP}{H}+{P}{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
{HP}{H}*"."{H}+{P}{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
{HP}{H}+"."{P}{FS}? { return app_float(yylval->fconstant, yytext, yyleng); }
({SP}?\"([^"\\\n]|{ES})*\"{WS}*)+ {
		/*
		 * String literal. One token covers a whole run of adjacent literals
		 * together with the whitespace between them. The action re-walks the
		 * matched text as a consistency check and then hands the complete
		 * spelling on as a STRING_LITERAL.
		 */
		const char* p = yytext;
		do
		{
			// Optional encoding prefix: u8, u, U or L. The digit of a u8
			// prefix comes before the opening quote, so it has to be
			// skipped together with the u and not after the quote.
			if(*p == 'u' && *(p+1) == '8')
				p += 2;
			else if(*p == 'u' || *p == 'U' || *p == 'L')
				++p;
			++p; // opening quote

			// body: plain characters and escape sequences up to the closing quote
			bool ok = true;
			do
			{
				if(*p != '"' && *p != '\\' && *p != '\n') // ordinary character
					++p;
				else if(*p == '\\')
				{
					skip_esc_seq(p);
				}
				else // closing quote, anything else is a scanner inconsistency
				{
					if(*p != '"')
						throw std::runtime_error("lexer error for char parsing");
					ok = false; // end found
				}
			} while(ok);
			++p; // closing quote

			// whitespace between adjacent literals belongs to the token
			for(; *p == ' ' || *p == '\t' || *p == '\n' || *p == '\v' || *p == '\f'; ++p) ;
		} while (*p == '"'
			|| (*p == 'u' && (*(p+1) == '"'
				|| (*(p+1) == '8' && *(p+2) == '"') ))
			|| (*p == 'U' && *(p+1) == '"')
			|| (*p == 'L' && *(p+1) == '"')
			); // only strings start like this

		// the walk has to end exactly at the terminator of the matched text
		if(*p) throw std::logic_error("end of token not 0");
		// FEATURE: common routine with identifier
		return app_with_string(yylval->string_literal, STRING_LITERAL, yytext, yyleng);
}
"..." { return app3(yylval->token, ELLIPSIS, yytext); }
">>=" { return app3(yylval->token, RIGHT_ASSIGN, yytext); }
"<<=" { return app3(yylval->token, LEFT_ASSIGN, yytext); }
"+=" { return app3(yylval->token, ADD_ASSIGN, yytext); }
"-=" { return app3(yylval->token, SUB_ASSIGN, yytext); }
"*=" { return app3(yylval->token, MUL_ASSIGN, yytext); }
"/=" { return app3(yylval->token, DIV_ASSIGN, yytext); }
"%=" { return app3(yylval->token, MOD_ASSIGN, yytext); }
"&=" { return app3(yylval->token, AND_ASSIGN, yytext); }
"^=" { return app3(yylval->token, XOR_ASSIGN, yytext); }
"|=" { return app3(yylval->token, OR_ASSIGN, yytext); }
">>" { return app3(yylval->token, RIGHT_OP, yytext); }
"<<" { return app3(yylval->token, LEFT_OP, yytext); }
"++" { return app3(yylval->token, INC_OP, yytext); }
"--" { return app3(yylval->token, DEC_OP, yytext); }
"->" { recent_struct_access = true; return app3(yylval->token, PTR_OP, yytext); }
"&&" { return app3(yylval->token, AND_OP, yytext); }
"||" { return app3(yylval->token, OR_OP, yytext); }
"<=" { return app3(yylval->token, LE_OP, yytext); }
">=" { return app3(yylval->token, GE_OP, yytext); }
"==" { return app3(yylval->token, EQ_OP, yytext); }
"!=" { return app3(yylval->token, NE_OP, yytext); }
";" { return app3(yylval->token, ';', yytext); }
("{"|"<%") {
if(recent_tokens[0] == ENUM || recent_tokens[-1] == ENUM)
in_enum_block = true;
++braces_since_typedef;
return app3(yylval->token, '{', yytext); }
("}"|"%>") {
if(in_enum_block)
in_enum_block = false;
--braces_since_typedef;
return app3(yylval->token, '}', yytext);
}
"," { return app3(yylval->token, ',', yytext); }
":" { return app3(yylval->token, ':', yytext); }
"=" { return app3(yylval->token, '=', yytext); }
"(" { return app3(yylval->token, '(', yytext); }
")" { return app3(yylval->token, ')', yytext); }
("["|"<:") { return app3(yylval->token, '[', yytext); }
("]"|":>") { return app3(yylval->token, ']', yytext); }
"." { recent_struct_access = true; return app3(yylval->token, '.', yytext); }
"&" { return app3(yylval->token, '&', yytext); }
"!" { return app3(yylval->token, '!', yytext); }
"~" { return app3(yylval->token, '~', yytext); }
"-" { return app3(yylval->token, '-', yytext); }
"+" { return app3(yylval->token, '+', yytext); }
"*" { return app3(yylval->token, '*', yytext); }
"/" { return app3(yylval->token, '/', yytext); }
"%" { return app3(yylval->token, '%', yytext); }
"<" { return app3(yylval->token, '<', yytext); }
">" { return app3(yylval->token, '>', yytext); }
"^" { return app3(yylval->token, '^', yytext); }
"|" { return app3(yylval->token, '|', yytext); }
"?" { return app3(yylval->token, '?', yytext); }
"\t" { app2(yylval->token, '\t'); }
"\n" { app2(yylval->token, '\n'); }
"\v" { app2(yylval->token, '\v'); }
"\f" { app2(yylval->token, '\f'); }
" " { app2(yylval->token, ' '); }
"#"([^\\\n]*\\[ \t\v\f]*"\n")*[^\\\n]*"\n" { pos_counter::parse_ppline(yytext); }
. { std::cerr << "At: " << yytext << std::endl; throw std::runtime_error("bad character"); }
%%
#if 0
int yywrap(void) /* called at end of input */
{
return 1; /* terminate now */
}
#endif
/*
 * Reads the remainder of a block comment whose opening marker has already
 * been matched by the scanner rule, and passes the complete comment text
 * (opening marker included) to app_comment().
 *
 * If the input ends before the closing marker is seen, nothing is passed to
 * app_comment() and yyerror("unterminated comment") is called instead.
 *
 * End of input is recognised both as 0 and as EOF: depending on the flex
 * release that generated the scanner, yyinput() reports the end either way,
 * and checking only for 0 would loop forever on an EOF-returning scanner.
 */
static void consume_comment(yyscan_t yyscanner)
{
    std::string com_str = "/*";
    bool after_star = false; // was the previous character a '*'?

    for (;;)
    {
        const int c = yyinput(yyscanner);
        if (c == 0 || c == EOF)
            break; // input exhausted inside the comment

        com_str += static_cast<char>(c);
        if (after_star && c == '/')
        {
            app_comment(com_str);
            return;
        }
        after_star = (c == '*');
    }
    yyerror("unterminated comment");
}
/*
 * Chooses the token kind for the identifier spelled by `text`, based on the
 * most recent tokens and on the current declaration state.
 *
 * Returns ATTR_NAME, IDENTIFIER, TYPEDEF_NAME or ENUMERATION_CONSTANT.
 *
 * This is not a pure query: several branches register the name through
 * states.flag_symbol(), and the declarator branch also updates
 * declaration_state and recent_typedef_keyword.
 */
static int check_type(const char* text) // FEATURE: lookup_table function?
{
#ifdef LEXER_DEBUG
    std::cout << "STATE NOW: " << declaration_state << std::endl;
#endif

    // attribute names go separate: an ATTRIBUTE token at index -3, -4 or -5
    // of recent_tokens makes this name an attribute name
    const bool follows_attribute_keyword =
        recent_tokens[-3] == ATTRIBUTE ||
        recent_tokens[-4] == ATTRIBUTE ||
        recent_tokens[-5] == ATTRIBUTE; // more than 3 attributes => parse error
    if(follows_attribute_keyword)
        return ATTR_NAME;

    /*
        Grammar: Each type_specifier is followed by either '<declaration_specifier>*['('<type_qualifier_list>*]*IDENTIFIER' => declaration,
        Grammar: if it's followed by something else it's not a declaration
         i.e. <this regexp> => declaration

        The C standard forces that at least one declaration specifier is a type specifier
        (http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf 6.7.2.2)

        Thus:
            type_specifier + regex + IDENTIFIER (grammar)=> declaration, otherwise => no declaration
            declaration (C standard)=> type_specifier (above)=> type_specifier + regex + IDENTIFIER
        Finally:
            type_specifier + regex + IDENTIFIER (grammar) <=> declaration
    */

    // directly after a member access operator: always a plain identifier
    if(recent_tokens[-1] == '.' || recent_tokens[-1] == PTR_OP)
        return IDENTIFIER;

    // directly after struct / union / enum: remember the name and let the
    // state machine know that a definition may follow
    if(recent_tokens[-1] == STRUCT || recent_tokens[-1] == UNION || recent_tokens[-1] == ENUM)
    {
        states.recent_was_flagged_unknown = text;
        states.maybe_struct_definition();
        //states.flag_symbol(text, lt_struct_bound);
        return IDENTIFIER;
    }

    /*
        first of all, check if this is declaration
        lookup table does not matter, since this could be an identifier with the same name
        this comparison is reliable in all cases:
    */
    if(states.enum_state() == 1)
    {
        // the declaration check below finds *all* declarations, except
        // those inside of enum {}
        // find these manually, this is easy
        states.flag_symbol(text, lt_enumeration);
#ifdef LEXER_DEBUG
        std::cout << " -> declaration, enum" << std::endl;
#endif
        return IDENTIFIER;
    }

    if(declaration_state == declarator_found)
    {
        // typedefs can not occur in structs
        const bool belongs_to_typedef =
            (recent_typedef_keyword == states.get_brack_count());
        if(!belongs_to_typedef)
        {
            states.flag_symbol(text, lt_identifier);
        }
        else
        {
            states.flag_symbol(text, lt_typedef_name);
            recent_typedef_keyword = -1;
        }
        // declaration_state = expect_type_specifier; // reset for next declaration
#ifdef LEXER_DEBUG
        std::cout << " -> declaration, no braces" << std::endl;
#endif
        declaration_state = expect_initializer_or_comma;
        return IDENTIFIER;
    }

#if 0
    if(states.get_brack_count())
    {
        // the last IF found *all* declarations, except all inside of enum {} and struct {}
        // find these manually, this is easy
        states.flag_symbol(text,
            (in_enum_block &&
                (recent_tokens[-1] == '{' ||
                recent_tokens[-1] == ',') )
            ? lt_enumeration
            : lt_identifier);
        // TODO: the comma might be from the comma op ! error !
        std::cout << " -> declaration, braces" << std::endl;
        return IDENTIFIER;
    }
#endif

    /* disabled:
    if(states.declaration_state_pars_after > 0)
    {
        // must be a function. all identifiers must be declarators
        states.flag_symbol(text, lt_identifier);
#ifdef LEXER_DEBUG
        std::cout << " -> declaration, function parameter" << std::endl;
#endif
        return IDENTIFIER;
    }*/

    // now it really can't be a declaration, so it must be known
    switch(lookup_table_t::type_of(text))
    {
        case lt_typedef_name: /* previously defined */
            return TYPEDEF_NAME;
        case lt_enumeration: /* previously defined */
            return ENUMERATION_CONSTANT;
        case lt_identifier:
        case lt_struct_bound:
            return IDENTIFIER;
        case lt_undefined: /* undefined or identifier list */
            if(allow_undefined) /* in this mode, identifier lists are forbidden */
                return IDENTIFIER; /* best guess */
            break; /* undefined are forbidden => flag as identifier list below */
        default:
            break;
    }

    // shared tail for a forbidden undefined name and for every other lookup
    // result: flag as identifier list
    states.flag_symbol(text, lt_identifier_list);
    states.recent_was_flagged_unknown = text;
    return IDENTIFIER;
    //throw "This is neither a declaration, nor is this type known.";

#if 0
    lookup_type lt = lookup_table_t::type_of(text);
    if(lt == lt_undefined)
    {
        // the braces check is a stupid fix
        // however, it covers all problematic cases, like
        // typedef struct { int x } s;
        if(recent_typedef_keyword && (braces_since_typedef == 0))
        {
            lt = lt_typedef_name;
            recent_typedef_keyword=false;
        }
        else if(in_enum_block &&
            (get_token_vector().back()->value() == '{' ||
            get_token_vector().back()->value() == ',') )
        {
            lt = lt_enumeration;
        }
        else // everything else...
            lt = lt_identifier;
        states.flag_symbol(text, lt);
        // since lt == lt_undefined, this is the definition
        // in all case, for definitions, return IDENTIFIER
        // for later use, return the real symbol type
        return IDENTIFIER;
    }
    else switch(lt)
    {
        case lt_typedef_name: /* previously defined */
            return TYPEDEF_NAME;
        case lt_enumeration: /* previously defined */
            return ENUMERATION_CONSTANT;
        default: /* includes undefined */
            return IDENTIFIER;
    }
#endif
}
/*
 * Error hook used by this scanner (and declared extern for other users):
 * writes the message to stderr as one line of the form "Error: <msg>".
 */
void yyerror(const char *msg)
{
    FILE *const sink = stderr;
    fprintf(sink, "Error: %s\n", msg);
}