#! /usr/bin/env python3
# This file is a part of the Pepper project, https://github.com/devosoft/Pepper
# (C) Michigan State University, under the MIT License
# See LICENSE.txt for more information
"""
This is the lexer for PEPPr
It's responsible for tokenizing the incoming character stream. The Parser will ingest the
token stream and build a tree, which will in turn produce actual C++ or C code.
"""
import sys
import ply.lex as lex
import argparse
import pepper.symbol_table as symtable
# Single-character tokens. Each character of the string segments below becomes
# one entry of the list, in this order.
DEFAULT_LITERALS = list(
    "+-*/()"
    "=,{}[]"
    ".;!<>:~"
    "@#&'%?"
)
# PLY reads the module-level name `literals`; it is the same list object, not a copy.
literals = DEFAULT_LITERALS
# Additional lexer states; PLY's default INITIAL state always exists on top of these.
states = [("comment", "exclusive")]
# Directive names; each one is turned into a PREPROCESSING_KEYWORD_<NAME> token type below.
PREPROCESSING_KEYWORDS = [
    'include', 'define',
    'ifdef', 'ifndef', 'endif',
    'else', 'if',
    'py',
]

# Every token type the lexer may emit: the fixed ones first, then one per
# preprocessing keyword, in the same order as PREPROCESSING_KEYWORDS.
tokens = [
    'IDENTIFIER', 'NEWLINE', 'OTHER', 'PREPROCESSING_NUMBER', 'PUNCTUATOR',
    # 'SKIPPED_LINE',
    'STRING_LITERAL', 'WHITESPACE', 'LONG_COMMENT', 'SYSTEM_INCLUDE_LITERAL',
] + ["PREPROCESSING_KEYWORD_" + keyword.upper() for keyword in PREPROCESSING_KEYWORDS]
# PLY token rule for the `#py` directive keyword (whole word only).
# The raw string is the rule's regular expression: PLY reads it from the
# function docstring, so it must stay the first statement and stay unchanged.
def t_PREPROCESSING_KEYWORD_PY(t):
    r"\#py\b"
    return t
# PLY token rule for the `#ifdef` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_IFDEF(t):
    r'\#ifdef\b'
    return t
# PLY token rule for the `#ifndef` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_IFNDEF(t):
    r'\#ifndef\b'
    return t
# PLY token rule for the `#endif` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_ENDIF(t):
    r'\#endif\b'
    return t
# PLY token rule for the `#if` directive keyword (whole word only, so it does
# not match the prefix of `#ifdef` / `#ifndef`).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_IF(t):
    r'\#if\b'
    # A PLY rule that returns nothing discards the match; return the token so
    # `#if` reaches the parser like the other directive keywords.
    return t
# PLY token rule for the `#else` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_ELSE(t):
    r'\#else\b'
    return t
# PLY token rule for the `#include` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_INCLUDE(t):
    r'\#include\b'
    return t
# PLY token rule for the `#define` directive keyword (whole word only).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_KEYWORD_DEFINE(t):
    r'\#define\b'
    return t
# PLY token rule for an angle-bracketed include target such as `<iostream>`:
# a `<`, then the shortest run of characters containing no quotes and no
# angle brackets, then a `>`.
# The docstring is the rule's regular expression and must stay unchanged.
def t_SYSTEM_INCLUDE_LITERAL(t):
    r"""<[^\'\"<>]*?>"""
    return t
# PLY token rule for identifiers: a letter or underscore followed by letters,
# digits or underscores, optionally with a trailing `...`; a bare `...` on its
# own is also lexed as an IDENTIFIER.
# The docstring is the rule's regular expression and must stay unchanged.
def t_IDENTIFIER(t):
    r'([_a-zA-Z][_a-zA-Z0-9]*(\.\.\.)?)|(\.\.\.)'
    return t
# PLY token rule for preprocessing numbers: an optional leading `.`, one digit,
# then any run of digits, letters, or signed-exponent pairs (`e+`, `e-`, `E+`,
# `E-`, `p+`, `p-`, `P+`, `P-`).
# The docstring is the rule's regular expression and must stay unchanged.
def t_PREPROCESSING_NUMBER(t):
    r'\.?[0-9]([0-9]|(e\+)|(e\-)|(E\+)|(E\-)|(p\+)|(p\-)|(P\+)|(P\-)|[a-zA-Z])*'
    return t
# PLY token rule for single- or double-quoted literals. Inside the quotes the
# only backslash escapes accepted are the matching quote, `\t` and `\n`; any
# other backslash makes the literal fail to match.
# The docstring is the rule's regular expression and must stay unchanged.
def t_STRING_LITERAL(t):
    r"""('((\\['tn])|[^'\\])*')|("((\\["tn])|[^"\\])*")"""
    return t
# TODO: maybe convert this to a t_ignore() rule for improved lexing performance
# PLY token rule for a single line feed. Besides emitting a NEWLINE token it
# advances the lexer's line counter, which is what t_error reports.
# The docstring is the rule's regular expression and must stay unchanged.
def t_NEWLINE(t):
    r"\n"
    t.type = 'NEWLINE'
    # The lexer doesn't know what constitutes a 'line' unless we tell it.
    t.lexer.lineno += 1
    return t
# PLY token rule for horizontal whitespace: exactly one tab or one space per
# token (runs are not merged).
# The docstring is the rule's regular expression and must stay unchanged.
def t_WHITESPACE(t):
    r"[\t ]"
    return t
# PLY error hook, called when no rule matches at the current position.
# Raises symtable.PepperSyntaxError naming the current line and the first
# offending character (t.value holds the remaining unmatched input).
def t_error(t):
    raise symtable.PepperSyntaxError(f"Unknown token on line {t.lexer.lineno}: {t.value[0]}")
# Build the module-wide PLY lexer. This statement must stay here: at this point
# `lex` is still the imported ply.lex module; the `lex` function defined below
# rebinds that name.
lexer = lex.lex()
# Token types that the lex() function below skips when printing (they are only
# shown, with an "(IGNORED:)" prefix, in debug mode).
ignore = ['WHITESPACE', 'NEWLINE']
def lex(lines, debug_mode=False):
    """Tokenize the whole input and print one line per token to stdout.

    Args:
        lines: A single string holding the entire input, newlines included.
        debug_mode: When true, WHITESPACE and NEWLINE tokens are printed with
            an ``(IGNORED:)`` prefix instead of being skipped silently.

    Returns:
        Always 0.

    Exits the process with status 1 if a token of type ``UNKNOWN`` is seen.
    """
    lexer.input(lines)
    # Drain the lexer before printing anything, so an error raised while
    # lexing surfaces before any output. token() returns None at end of input.
    arcade = list(iter(lexer.token, None))

    for token in arcade:
        try:
            if token.type in ignore:
                if debug_mode:
                    print(f"(IGNORED:) {token.type}: {token.value}")
            elif token.type in literals:
                print(f"ASCII_LITERAL: {token.value}")
            elif token.type != 'UNKNOWN':
                print(f"{token.type}: {token.value}")
            else:
                print(f"Unknown token in input: {token.value}")
                sys.exit(1)
        except Exception:
            # Deliberately not a bare `except:`: SystemExit from sys.exit(1)
            # above is not an Exception subclass and must propagate.
            print(f'Blew up trying to access type of {token}')
    return 0
def get_args():
    """Parse the command line for the standalone lexer.

    Returns:
        An ``argparse.Namespace`` with ``input_file`` (an open text file, or
        ``sys.stdin`` when no path is given) and ``debug_mode`` (bool).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file',
                        # argparse only honours `default` on a positional when it is
                        # optional, so nargs='?' is what makes the stdin fallback work.
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="The file to lex")
    parser.add_argument('--debug_mode', action='store_true')
    return parser.parse_args()
def main():
    """Command-line entry point: read the input file and print its tokens."""
    args = get_args()
    source = args.input_file
    try:
        text = source.read()
    finally:
        # argparse opened this handle for us; release it unless it is the
        # process's own stdin.
        if source is not sys.stdin:
            source.close()
    lex(text, args.debug_mode)
# Run the standalone lexer only when this file is executed as a script.
if __name__ == '__main__':
    main()