package tokenizer

import "core:fmt"
import "core:strings"
import "core:unicode/utf8"

// Set of strings, used for the keyword table. A key mapped to `true` is a member.
Set :: distinct map[string]bool;

// TODO: Use an arena allocator
// Tokenizer holds the input, the configuration, the scanning state and the
// tokens produced so far. Create one with makeTokenizer, run tokenize on it,
// and release it with destroyTokenizer.
Tokenizer :: struct {
    source: string,
    keywords: ^Set,               // May be nil: then no identifier is classified as a keyword.
    ident_allowWhitespace: bool,  // When set, whitespace does not terminate an identifier.

    tokenContext: TokenizerContext, // Kind of token currently being scanned.

    line, col: int,        // 1-based position of the current rune (col is 0 before the first rune of a line).
    newLine: bool,         // TODO
    backslashEscape: bool, // TODO
    quoted: bool,          // Set by '$'; makes the next identifier a QuotedIdentifier.

    currentIndex: int,     // Byte offset of the current rune in source.
    currentRune: rune,
    currentRuneSize: int,
    tokenStart: int,       // Byte offset where the token being scanned began.

    tokens: [dynamic]Token,
}

// What the tokenizer is in the middle of scanning.
TokenizerContext :: enum u8 {
    None,
    Number,
    Identifier,
    QuotedIdentifier,
    Char,
    String,
    RawString,
}

TokenType :: enum u16 {
    // 0 - 255 are the ascii characters
    Null = 0,
    HorizontalTab = 9,
    ESC = 27,
    Exclamation = 33,
    Hash = 35,
    DollarSign = 36,
    Percent = 37,
    Ampersand = 38,
    LeftParen = 40,
    RightParen = 41,
    Astrisk = 42,
    Plus = 43,
    Comma = 44,
    Dash = 45,
    Dot = 46,
    Slash = 47,
    Colon = 58,
    Semicolon = 59,
    LessThan = 60,
    Equal = 61,
    GreaterThan = 62,
    QuestionMark = 63,
    At = 64,
    LeftBracket = 91,
    Backslash = 92,
    RightBracket = 93,
    Hat = 94, // ^
    Unerscore = 95,
    Backtick = 96, // `
    LeftCurlyBracket = 123,
    VerticalBar = 124, // |
    RightCurlyBracket = 125,

    // Multi-character token kinds start above the ascii range.
    Whitespace = 256,
    Identifier,
    QuotedIdentifier,
    Keyword,
    Number,
    String,
    RawString,
    Char,
    End,
}

// A single token: its kind, the slice of the source it covers, and the
// line/column at which it starts.
Token :: struct {
    type: TokenType,
    str: string,
    line, col: int,
}

// Builds a Token.
//
// `col` is the column of the LAST rune of `str`; the stored column is the
// column of its first rune. The width is measured in runes, not bytes, so
// multi-byte text inside a token does not skew the column.
// Limitation: for a token that spans several lines (e.g. a raw string), `line`
// and `col` describe its end, so the computed start column is not meaningful.
makeToken :: proc(type: TokenType, str: string, line, col: int) -> Token {
    runeCount := 0;
    for _ in str {
        runeCount += 1;
    }

    token: Token;
    token.type = type;
    token.str = str;
    token.line = line;
    token.col = col - runeCount + 1;
    return token;
}

// Creates a tokenizer positioned before the first rune of `source`.
// `keywords` may be nil. `ident_allowWhitespace` lets identifiers contain
// whitespace.
makeTokenizer :: proc(source: string, keywords: ^Set, ident_allowWhitespace: bool = false) -> Tokenizer {
    tokenizer: Tokenizer;
    tokenizer.tokenContext = TokenizerContext.None;
    tokenizer.source = source;
    tokenizer.line = 1;
    tokenizer.col = 0;
    tokenizer.tokens = make([dynamic]Token, 0);
    tokenizer.keywords = keywords;
    tokenizer.ident_allowWhitespace = ident_allowWhitespace;
    return tokenizer;
}

// Frees the token list. The source string and keyword set are not touched.
destroyTokenizer :: proc(tok: ^Tokenizer) {
    delete(tok.tokens);
}

// Reports whether `r` is a line-feed or carriage-return.
is_newline :: proc(r: rune) -> bool {
    switch r {
    case '\n', '\r':
        return true;
    case:
        return false;
    }
}

// Feeds the current rune to the handler for the current context.
dispatchCurrentRune :: proc(tok: ^Tokenizer) {
    switch tok.tokenContext {
    case .None:             handleNone(tok);
    case .Number:           handleNumber(tok);
    case .Identifier:       handleIdentifier(tok);
    case .QuotedIdentifier: handleIdentifier(tok, true);
    case .Char:             handleChar(tok);
    case .String:           handleString(tok, false);
    case .RawString:        handleString(tok, true);
    }
}

// -----

// Scans the whole source and appends the resulting tokens to tok.tokens,
// always finishing with an End token.
//
// Every rune, including line breaks, is dispatched to the context handler, so
// a newline terminates an identifier or number like any other delimiter.
// Line accounting: "\n", "\r" and "\r\n" each count as exactly one line break.
tokenize :: proc(tok: ^Tokenizer) {
    for r, i in tok.source {
        tok.currentIndex = i;
        tok.currentRune = r;
        tok.currentRuneSize = utf8.rune_size(r);
        tok.col += 1;

        // Dispatch before moving to the next line so a token ended by this
        // line break is still reported on the line it was written on.
        dispatchCurrentRune(tok);

        if is_newline(r) {
            // For "\r\n" the break is counted on the '\n' only.
            // '\r' is a single byte, so i + 1 is the next rune's offset.
            next := i + 1;
            isCarriageReturnOfCRLF := r == '\r' && next < len(tok.source) && tok.source[next] == '\n';
            if !isCarriageReturnOfCRLF {
                tok.line += 1;
                tok.col = 0;
            }
        }
    }

    // End of file/input: present a NUL rune one column past the last rune so
    // that a pending identifier or number is flushed. Unterminated string and
    // char literals are dropped silently (their handlers only react to the
    // closing quote).
    tok.currentIndex = len(tok.source); // also correct for empty input and a multi-byte final rune
    tok.currentRune = '\x00';
    tok.currentRuneSize = 1;
    tok.col += 1;
    dispatchCurrentRune(tok);

    // End token: empty text; makeToken places it one column past tok.col - 1,
    // i.e. at the column of the virtual end-of-input rune.
    endToken := makeToken(TokenType.End, tok.source[tok.currentIndex:], tok.line, tok.col - 1);
    append(&tok.tokens, endToken);
}

// Prints every token, with a blank line after each semicolon token.
printTokens :: proc(tok: ^Tokenizer) {
    for token in tok.tokens {
        fmt.println(token);
        if token.type == TokenType.Semicolon {
            fmt.println("");
        }
    }
}

// Maps a punctuation rune to its single-character token type.
// The second result is false when `r` is not a recognised punctuation rune.
punctuationTokenType :: proc(r: rune) -> (TokenType, bool) {
    kind := TokenType.Null;
    ok := true;
    switch r {
    case '(':  kind = TokenType.LeftParen;
    case ')':  kind = TokenType.RightParen;
    case '{':  kind = TokenType.LeftCurlyBracket;
    case '}':  kind = TokenType.RightCurlyBracket;
    case ':':  kind = TokenType.Colon;
    case ';':  kind = TokenType.Semicolon;
    case ',':  kind = TokenType.Comma;
    case '-':  kind = TokenType.Dash;
    case '+':  kind = TokenType.Plus;
    case '*':  kind = TokenType.Astrisk;
    case '/':  kind = TokenType.Slash;
    case '\\': kind = TokenType.Backslash;
    case '^':  kind = TokenType.Hat;
    case '.':  kind = TokenType.Dot;
    case '=':  kind = TokenType.Equal;
    case:      ok = false;
    }
    return kind, ok;
}

// Handles a rune while no token is in progress: skips whitespace, starts a
// multi-rune token (identifier, number, string, raw string, char) or emits a
// single-character punctuation token. Runes that match none of these are
// ignored.
//
// '$' sets `quoted`, which turns an identifier that starts at the next
// non-whitespace rune into a QuotedIdentifier; any other non-whitespace rune
// clears the flag.
handleNone :: proc(using tok: ^Tokenizer) {
    // Skip Whitespace (does not clear `quoted`)
    if strings.is_space(currentRune) do return;

    switch currentRune {
    case '$':
        quoted = true;
        return;
    case 'a'..'z', 'A'..'Z':
        tokenStart = currentIndex;
        if quoted {
            tokenContext = TokenizerContext.QuotedIdentifier;
        } else {
            tokenContext = TokenizerContext.Identifier;
        }
    case '0'..'9':
        tokenStart = currentIndex;
        tokenContext = TokenizerContext.Number;
    case '"':
        tokenStart = currentIndex;
        tokenContext = TokenizerContext.String;
    case '`':
        tokenStart = currentIndex;
        tokenContext = TokenizerContext.RawString;
    case '\'':
        tokenStart = currentIndex;
        tokenContext = TokenizerContext.Char;
    case:
        // All recognised punctuation is one byte wide, so a one-byte slice is the token text.
        if kind, ok := punctuationTokenType(currentRune); ok {
            token := makeToken(kind, source[currentIndex:currentIndex + 1], line, col);
            append(&tokens, token);
        }
    }

    quoted = false;
}

// Handles a rune while an identifier is in progress. Letters, digits, '_' and
// '-' extend the identifier (as does whitespace when ident_allowWhitespace is
// set); any other rune ends it, emits the token and is then re-processed by
// handleNone.
//
// An identifier found in the keyword set is emitted as Keyword, even when it
// was quoted.
handleIdentifier :: proc(using tok: ^Tokenizer, quotedIdentifier: bool = false) {
    // Allow whitespace in identifiers
    if tok.ident_allowWhitespace && strings.is_space(currentRune) do return;

    switch currentRune {
    case 'a'..'z', 'A'..'Z', '0'..'9', '_', '-':
        return;
    }

    str := source[tokenStart:currentIndex];

    kind := TokenType.Identifier;
    if quotedIdentifier do kind = TokenType.QuotedIdentifier;
    if tok.keywords != nil && tok.keywords[str] do kind = TokenType.Keyword;

    // The current rune is the delimiter; the identifier ended one column earlier.
    token := makeToken(kind, str, line, col - 1);
    append(&tokens, token);

    tokenContext = TokenizerContext.None;
    handleNone(tok);
}

// Handles a rune while a string literal is in progress. Everything up to the
// closing delimiter ('"' for normal strings, '`' for raw strings) is part of
// the literal; the emitted token text includes both delimiters.
// Backslash escapes are not interpreted (see the backslashEscape TODO).
handleString :: proc(using tok: ^Tokenizer, raw: bool = false) {
    terminator: rune = '"';
    kind := TokenType.String;
    if raw {
        terminator = '`';
        kind = TokenType.RawString;
    }

    if currentRune != terminator do return;

    token := makeToken(kind, source[tokenStart:currentIndex + 1], line, col);
    append(&tokens, token);
    tokenContext = TokenizerContext.None;
}

// TODO: Error on more than one character in char literal
// Handles a rune while a char literal is in progress: the next single quote
// closes it and emits the token, quotes included.
handleChar :: proc(using tok: ^Tokenizer) {
    if currentRune != '\'' do return;

    token := makeToken(TokenType.Char, source[tokenStart:currentIndex + 1], line, col);
    append(&tokens, token);
    tokenContext = TokenizerContext.None;
}

// Handles a rune while a number is in progress. Digits and '.' extend the
// number; any other rune ends it, emits the token and is then re-processed by
// handleNone.
handleNumber :: proc(using tok: ^Tokenizer) {
    switch currentRune {
    case '0'..'9', '.':
        return;
    }

    // Note: Whitespace *not* allowed
    // The current rune is the delimiter; the number ended one column earlier.
    token := makeToken(TokenType.Number, source[tokenStart:currentIndex], line, col - 1);
    append(&tokens, token);

    tokenContext = TokenizerContext.None;
    handleNone(tok);
}