Source code for pepper.lexer

#! /usr/bin/env python3

# This file is a part of the Pepper project, https://github.com/devosoft/Pepper
# (C) Michigan State University, under the MIT License
# See LICENSE.txt for more information

"""
This is the lexer for PEPPr

It's responsible for tokenizing the incoming character stream. The Parser will ingest the
token stream and build a tree, which will in turn produce actual c++ or c code.
"""

import sys
import ply.lex as lex
import argparse
import pepper.symbol_table as symtable

DEFAULT_LITERALS = ['+', '-', '*', '/', '(', ')',
                    '=', ',', '{', '}', '[', ']',
                    '.', ';', '!', '<', '>', ':', '~',
                    '@', '#', '&', "'", '%', "?"]


literals = DEFAULT_LITERALS

states = [
    # recall there's also the default INITIAL state
    ('comment', 'exclusive')
]

PREPROCESSING_KEYWORDS = [
    'include',
    'define',
    'ifdef',
    'ifndef',
    'endif',
    'else',
    'if',
    'py',
]

tokens = [
    'IDENTIFIER',
    'NEWLINE',
    'OTHER',
    'PREPROCESSING_NUMBER',
    'PUNCTUATOR',
    # 'SKIPPED_LINE',
    'STRING_LITERAL',
    'WHITESPACE',
    'LONG_COMMENT',
    'SYSTEM_INCLUDE_LITERAL'
]

tokens.extend([f"PREPROCESSING_KEYWORD_{i.upper()}" for i in PREPROCESSING_KEYWORDS])


[docs]def t_PREPROCESSING_KEYWORD_PY(t): r"\#py\b" return t
[docs]def t_COMMENT(t): r"\s//.*" pass
[docs]def t_COMMENT_NO_WHITESPACE(t): r"//.*" pass
[docs]def t_PREPROCESSING_KEYWORD_IFDEF(t): r'\#ifdef\b' return t
[docs]def t_PREPROCESSING_KEYWORD_IFNDEF(t): r'\#ifndef\b' return t
[docs]def t_PREPROCESSING_KEYWORD_ENDIF(t): r'\#endif\b' return t
[docs]def t_PREPROCESSING_KEYWORD_IF(t): r'\#if\b'
[docs]def t_PREPROCESSING_KEYWORD_ELSE(t): r'\#else\b' return t
[docs]def t_PREPROCESSING_KEYWORD_INCLUDE(t): r'\#include\b' return t
[docs]def t_PREPROCESSING_KEYWORD_DEFINE(t): r'\#define\b' return t
[docs]def t_SYSTEM_INCLUDE_LITERAL(t): r"""<[^\'\"<>]*?>""" return t
[docs]def t_IDENTIFIER(t): r'([_a-zA-Z][_a-zA-Z0-9]*(\.\.\.)?)|(\.\.\.)' return t
[docs]def t_PREPROCESSING_NUMBER(t): r'\.?[0-9]([0-9]|(e\+)|(e\-)|(E\+)|(E\-)|(p\+)|(p\-)|(P\+)|(P\-)|[a-zA-Z])*' return t
[docs]def t_STRING_LITERAL(t): r"""('((\\['tn])|[^'\\])*')|("((\\["tn])|[^"\\])*")""" return t
[docs]def t_LONG_COMMENT_START(t): r"\/\*" t.lexer.begin('comment') pass
[docs]def t_comment_BLOCK_COMMENT_END(t): r"\*\/" t.lexer.begin('INITIAL') # reset to initial state pass
[docs]def t_comment_ignore_anything_else(t): r".+?" pass
[docs]def t_comment_NEWLINE(t): r'\n' t.lexer.lineno += 1 # the lexer doesn't know what consistutes a 'line' unless we tell it return t
[docs]def t_comment_error(t): raise symtable.PepperSyntaxError(f"Unknown token on line {t.lexer.lineno}: {t.value[0]}")
# TODO: maybe convert this to a t_ignore() rule for improved lexing performance
[docs]def t_NEWLINE(t): r"\n" t.type = 'NEWLINE' t.lexer.lineno += 1 # the lexer doesn't know what consistutes a 'line' unless we tell it return t
[docs]def t_WHITESPACE(t): r"[\t ]" return t
[docs]def t_error(t): raise symtable.PepperSyntaxError(f"Unknown token on line {t.lexer.lineno}: {t.value[0]}")
lexer = lex.lex() ignore = ['WHITESPACE', 'NEWLINE']
[docs]def lex(lines, debug_mode=False): "Takes a single string, containing newlines, that's the entire input" # lexer.input("".join(ilines)) lexer.input(lines) arcade = [] tok = True while True: tok = lexer.token() if not tok: break # end of file reached arcade.append(tok) for token in arcade: try: if token.type in ignore: if debug_mode: print(f"(IGNORED:) {token.type}: {token.value}") else: continue elif token.type in literals: print(f"ASCII_LITERAL: {token.value}") elif token.type != 'UNKNOWN': print(f"{token.type}: {token.value}") else: print(f"Unknown token in input: {token.value}") sys.exit(1) except: # NOQA print(f'Blew up trying to access type of {token}') return 0
[docs]def get_args(): parser = argparse.ArgumentParser() parser.add_argument('input_file', type=argparse.FileType('r'), default=sys.stdin, help="The file to lex") parser.add_argument('--debug_mode', action='store_true') return parser.parse_args()
[docs]def main(): args = get_args() lex(args.input_file.read(), args.debug_mode)
if __name__ == '__main__': main()