//! Lexer that NFKC-normalises its input, splits it into grapheme clusters and
//! turns those clusters into `Token`s.

const std = @import("std");
const Error = @import("./Error.zig");
const Token = @import("./Token.zig");
const ziglyph = @import("ziglyph");
const Normalizer = ziglyph.Normalizer;
const Grapheme = ziglyph.Grapheme;
const CodePoint = ziglyph.CodePoint;
const GraphemeIterator = Grapheme.GraphemeIterator;
const letter = ziglyph.letter;
const number = ziglyph.number;

const Self = @This();

/// Allocator used for the grapheme list, the normalised text and every lexeme.
allocator: std.mem.Allocator,
/// Grapheme clusters of the normalised input; their `bytes` point into `normalized`.
source: []Grapheme,
/// Index (into `source`) of the first grapheme of the token being scanned.
start: u32,
/// Index (into `source`) of the next grapheme to consume.
index: u32,
/// 1-based line number of the grapheme at `index`.
line: u32,
/// Owned copy of the NFKC-normalised input that backs `source`.
/// Defaults to an empty slice, which `deinit` can free safely.
normalized: []const u8 = "",

/// Builds a lexer over `source`.
///
/// The input is NFKC-normalised and copied, so the caller's `source` buffer
/// does not need to outlive the call. Release the lexer with `deinit`.
pub fn init(allocator: std.mem.Allocator, source: []const u8) !Self {
    var normalizer = try Normalizer.init(allocator);
    defer normalizer.deinit();

    var got_nfkc = try normalizer.nfkc(allocator, source);
    defer got_nfkc.deinit();

    // The graphemes below keep slices into the text they were cut from, so the
    // text has to live as long as the lexer does. `got_nfkc` is released when
    // this function returns, hence the owned copy.
    const normalized = try allocator.dupe(u8, got_nfkc.slice);
    errdefer allocator.free(normalized);

    var graphemes = std.ArrayList(Grapheme).init(allocator);
    defer graphemes.deinit();

    var iter = try GraphemeIterator.init(normalized);
    while (iter.next()) |grapheme| {
        try graphemes.append(grapheme);
    }

    return .{
        .allocator = allocator,
        .source = try graphemes.toOwnedSlice(),
        .line = 1,
        .start = 0,
        .index = 0,
        .normalized = normalized,
    };
}

/// Frees the grapheme list and the normalised text owned by the lexer.
/// Tokens previously returned by `scanTokens` are not touched.
pub fn deinit(self: *Self) void {
    self.allocator.free(self.source);
    self.allocator.free(self.normalized);
}

/// Scans the remaining input and returns the tokens found, terminated by an
/// `.eof` token.
///
/// The returned slice and every token's `lexeme` are allocated with the
/// lexer's allocator and are not freed by the lexer. The `.eof` token's
/// lexeme is the empty string literal.
pub fn scanTokens(self: *Self) ![]const Token {
    var list = std.ArrayList(Token).init(self.allocator);
    defer list.deinit();
    // On failure nobody else will ever see the lexemes collected so far.
    errdefer {
        for (list.items) |token| {
            self.allocator.free(token.lexeme);
        }
    }

    while (!self.isAtEnd()) {
        self.start = self.index;
        try self.scanToken(&list);
    }

    try list.append(.{
        .token_type = .eof,
        .lexeme = "",
        .line = self.line,
        .literal = null,
    });

    return list.toOwnedSlice();
}

/// True once every grapheme has been consumed.
fn isAtEnd(self: *Self) bool {
    return self.index >= self.source.len;
}

/// Returns the first code point of `grapheme`.
///
/// A grapheme cluster may hold several code points (for example "\r\n" or a
/// base character followed by combining marks), so only the leading UTF-8
/// sequence is decoded instead of handing the whole cluster to `utf8Decode`.
fn codepointize(grapheme: Grapheme) !u21 {
    const bytes = grapheme.bytes;
    if (bytes.len == 0) return error.InvalidUtf8;

    const sequence_len = try std.unicode.utf8ByteSequenceLength(bytes[0]);
    if (sequence_len > bytes.len) return error.InvalidUtf8;

    return std.unicode.utf8Decode(bytes[0..sequence_len]);
}

/// Returns the code point of the next grapheme without consuming it, or 0 at
/// the end of the input.
inline fn peek(self: *Self) !u21 {
    if (self.isAtEnd()) {
        return 0;
    }
    return codepointize(self.source[self.index]);
}

/// Consumes the next grapheme only if its code point equals `expected`.
fn match(self: *Self, expected: u21) !bool {
    if (self.isAtEnd()) return false;
    if ((try self.peek()) != expected) return false;
    self.index += 1;
    return true;
}

/// Consumes the next grapheme and returns its first code point.
/// Callers must make sure the lexer is not at the end of the input.
fn advance(self: *Self) !u21 {
    const grapheme = self.source[self.index];
    self.index += 1;
    // Checking the bytes rather than the decoded code point also catches the
    // "\r\n" cluster, whose first code point is '\r'.
    if (std.mem.indexOfScalar(u8, grapheme.bytes, '\n') != null) {
        self.line += 1;
    }
    return codepointize(grapheme);
}

/// Builds a token of `token_type` whose lexeme is a freshly allocated copy of
/// the graphemes in `source[start..index]`.
fn createToken(self: *Self, token_type: Token.Type, literal: ?Token.Literal) !Token {
    var text = std.ArrayList(u8).init(self.allocator);
    defer text.deinit();

    for (self.source[self.start..self.index]) |grapheme| {
        try text.appendSlice(grapheme.bytes);
    }

    return .{
        .token_type = token_type,
        .literal = literal,
        .lexeme = try text.toOwnedSlice(),
        .line = self.line,
    };
}

/// Pairs a keyword's token type with its spelling.
const KeywordEntry = struct {
    kw: Token.Type,
    text: []const u8,
};

/// Reserved words recognised by `identOrKeyword`.
const keywords = [_]KeywordEntry{
    .{ .kw = .kw_var, .text = "var" },
};

/// Scans the rest of an identifier whose first grapheme has already been
/// consumed, and returns either the matching keyword token or an
/// `.identifier` token.
fn identOrKeyword(self: *Self) !Token {
    while (ziglyph.isAlphaNum(try self.peek())) {
        _ = try self.advance();
    }

    // Compare the real lexeme so the keyword check sees exactly the bytes
    // that end up in the token.
    var token = try self.createToken(.identifier, null);
    for (keywords) |entry| {
        if (std.mem.eql(u8, token.lexeme, entry.text)) {
            token.token_type = entry.kw;
            break;
        }
    }
    return token;
}

/// Produces `matched` if the next code point is `second` (consuming it),
/// otherwise `single`.
fn oneOrTwo(self: *Self, second: u21, matched: Token.Type, single: Token.Type) !Token {
    const token_type = if (try self.match(second)) matched else single;
    return self.createToken(token_type, null);
}

/// Classifies the token that starts with the already-consumed code point `cp`.
///
/// Returns null when `cp` does not start any known token; in that case
/// exactly one grapheme has been consumed.
fn identifyToken(self: *Self, cp: u21) !?Token {
    switch (cp) {
        '(' => return try self.createToken(.left_paren, null),
        ')' => return try self.createToken(.right_paren, null),
        '{' => return try self.createToken(.left_brace, null),
        '}' => return try self.createToken(.right_brace, null),
        ',' => return try self.createToken(.comma, null),
        '.' => return try self.createToken(.dot, null),
        '-' => return try self.createToken(.minus, null),
        '+' => return try self.createToken(.plus, null),
        ';' => return try self.createToken(.semicolon, null),
        '*' => return try self.createToken(.star, null),
        '!' => return try self.oneOrTwo('=', .bang_equal, .bang),
        '=' => return try self.oneOrTwo('=', .equal_equal, .equal),
        '>' => return try self.oneOrTwo('=', .greater_equal, .greater),
        '<' => return try self.oneOrTwo('=', .less_equal, .less),
        '/' => {
            if (!try self.match('/')) {
                return try self.createToken(.slash, null);
            }
            // Line comment: runs up to, but not including, the newline.
            while (!self.isAtEnd() and (try self.peek()) != '\n') {
                _ = try self.advance();
            }
            return try self.createToken(.comment, null);
        },
        else => {
            if (ziglyph.isLetter(cp)) {
                return try self.identOrKeyword();
            }
            return null;
        },
    }
}

/// Scans one token starting at `self.start` and appends it to `list`.
///
/// Graphemes that start no known token are skipped; the skipped run is
/// reported once through `Error.report`, against the line on which it began.
/// `identifyToken` has no whitespace case, so whitespace is part of such runs.
fn scanToken(self: *Self, list: *std.ArrayList(Token)) !void {
    const run_begin = self.start;
    const run_line = self.line;

    var token = try self.identifyToken(try self.advance());
    // Until the token is stored in `list`, its lexeme is ours to release.
    errdefer if (token) |pending| self.allocator.free(pending.lexeme);

    while (token == null) {
        // The grapheme just consumed was unknown: move the token start past it.
        self.start = self.index;
        if (self.isAtEnd()) break;
        token = try self.identifyToken(try self.advance());
    }

    // `start` only moves forward when unknown graphemes were skipped.
    if (self.start > run_begin) {
        var unknown_text = std.ArrayList(u8).init(self.allocator);
        defer unknown_text.deinit();

        for (self.source[run_begin..self.start]) |grapheme| {
            try unknown_text.appendSlice(grapheme.bytes);
        }

        try Error.report(run_line, "lexer", "Unknown codepoints '{}'", .{std.zig.fmtEscapes(unknown_text.items)});
    }

    if (token) |found| {
        try list.append(found);
    }
}