const std = @import("std");
const Error = @import("./Error.zig");
const Token = @import("./Token.zig");
const ziglyph = @import("ziglyph");
const Normalizer = ziglyph.Normalizer;
const Grapheme = ziglyph.Grapheme;
const CodePoint = ziglyph.CodePoint;
const GraphemeIterator = Grapheme.GraphemeIterator;
const letter = ziglyph.letter; // or const letter = ziglyph.letter;
const number = ziglyph.number; // or const number = ziglyph.number;
const Self = @This();
/// Allocator used for `normalized`, `source` and every token lexeme.
allocator: std.mem.Allocator,
/// Lexer-owned copy of the NFKC-normalized input text. The graphemes in
/// `source` are produced by iterating this buffer, so it has to stay alive
/// until `deinit`. Defaults to an empty slice, which is safe to free.
normalized: []const u8 = &.{},
/// Grapheme clusters of `normalized`, in input order.
source: []Grapheme,
/// Index into `source` of the first grapheme of the token being scanned.
start: u32,
/// Index into `source` of the next grapheme to be consumed.
index: u32,
/// Line number stamped on tokens and error reports; `init` sets it to 1.
line: u32,

/// Builds a lexer over `source`.
///
/// The input is NFKC-normalized and split into grapheme clusters. The
/// normalized text is copied into memory owned by the lexer, so `source`
/// does not need to outlive this call. Release the lexer with `deinit`.
///
/// Errors: anything the normalizer or grapheme iterator reports, plus
/// `error.OutOfMemory`.
pub fn init(allocator: std.mem.Allocator, source: []const u8) !Self {
    var normalizer = try Normalizer.init(allocator);
    defer normalizer.deinit();
    var got_nfkc = try normalizer.nfkc(allocator, source);
    defer got_nfkc.deinit();

    // The normalization result is released when this function returns, yet
    // the graphemes collected below keep referring to the text they were cut
    // from. Iterate over a private copy so those references stay valid for
    // the whole lifetime of the lexer.
    const normalized = try allocator.dupe(u8, got_nfkc.slice);
    errdefer allocator.free(normalized);

    var graphemes = std.ArrayList(Grapheme).init(allocator);
    defer graphemes.deinit();
    var iter = try GraphemeIterator.init(normalized);
    while (iter.next()) |grapheme| {
        try graphemes.append(grapheme);
    }
    return .{
        .allocator = allocator,
        .normalized = normalized,
        .source = try graphemes.toOwnedSlice(),
        .line = 1,
        .start = 0,
        .index = 0,
    };
}

/// Frees the grapheme slice and the normalized text allocated by `init`.
/// Tokens previously returned by `scanTokens` own their lexemes and are not
/// affected.
pub fn deinit(self: *Self) void {
    self.allocator.free(self.source);
    self.allocator.free(self.normalized);
}
/// Scans the remaining source and returns the resulting tokens, always
/// terminated by a single `.eof` token with an empty lexeme.
///
/// The returned slice and each token's `lexeme` are allocated with
/// `self.allocator`; the caller is responsible for releasing them.
/// If scanning fails part-way, everything produced so far is freed here.
pub fn scanTokens(self: *Self) ![]const Token {
    var tokens = std.ArrayList(Token).init(self.allocator);
    // A successful `toOwnedSlice` leaves the list empty, so this cleanup only
    // has anything to free when an error unwinds the function.
    defer {
        for (tokens.items) |produced| {
            self.allocator.free(produced.lexeme);
        }
        tokens.deinit();
    }

    while (true) {
        if (self.isAtEnd()) break;
        // Every token starts where the previous one stopped.
        self.start = self.index;
        try self.scanToken(&tokens);
    }

    const eof: Token = .{
        .token_type = .eof,
        .lexeme = "",
        .line = self.line,
        .literal = null,
    };
    try tokens.append(eof);
    return tokens.toOwnedSlice();
}
/// Reports whether every grapheme of `source` has been consumed.
fn isAtEnd(self: *Self) bool {
    const remaining = self.index < self.source.len;
    return !remaining;
}
/// Returns the leading code point of `grapheme`.
///
/// A grapheme cluster may contain several code points (for example "\r\n",
/// or a base character followed by combining marks). Only the first one is
/// decoded; handing the whole cluster to `std.unicode.utf8Decode` is not
/// valid because that function expects exactly one encoded code point.
///
/// Errors:
/// - `error.EmptyGrapheme` when the grapheme has no bytes.
/// - `error.Utf8InvalidStartByte` when the first byte cannot start a sequence.
/// - `error.Utf8ExpectedContinuation` when the sequence is cut short.
/// - any other error reported by `std.unicode.utf8Decode`.
fn codepointize(grapheme: Grapheme) !u21 {
    const bytes = grapheme.bytes;
    if (bytes.len == 0) return error.EmptyGrapheme;

    // The lead byte announces how many bytes the first code point occupies.
    const first_len = try std.unicode.utf8ByteSequenceLength(bytes[0]);
    if (first_len > bytes.len) return error.Utf8ExpectedContinuation;

    return std.unicode.utf8Decode(bytes[0..first_len]);
}
/// Returns the code point of the next grapheme without consuming it, or 0
/// when the input is exhausted.
inline fn peek(self: *Self) !u21 {
    if (!self.isAtEnd()) {
        return codepointize(self.source[self.index]);
    }
    return 0;
}
/// Consumes the next grapheme only if it decodes to `expected`.
/// Returns whether a grapheme was consumed; at end of input nothing is
/// consumed and the result is `false`.
fn match(self: *Self, expected: u21) !bool {
    if (self.isAtEnd()) return false;
    const found = (try self.peek()) == expected;
    if (found) {
        self.index += 1;
    }
    return found;
}
/// Consumes the next grapheme and returns its code point.
/// The cursor moves forward even when decoding fails. The caller must make
/// sure the input is not exhausted; no end-of-input check is done here.
fn advance(self: *Self) !u21 {
    // Step past the grapheme once it has been read, on every exit path.
    defer self.index += 1;
    return codepointize(self.source[self.index]);
}
/// Builds a token of `token_type` spanning the graphemes from `self.start`
/// up to (not including) `self.index`, stamped with the current line.
///
/// The lexeme is a freshly allocated concatenation of those graphemes' bytes
/// and is owned by whoever receives the token.
fn createToken(self: *Self, token_type: Token.Type, literal: ?Token.Literal) !Token {
    const span = self.source[self.start..self.index];

    var buffer = std.ArrayList(u8).init(self.allocator);
    defer buffer.deinit();
    for (span) |piece| {
        try buffer.appendSlice(piece.bytes);
    }

    const lexeme = try buffer.toOwnedSlice();
    return .{
        .token_type = token_type,
        .lexeme = lexeme,
        .line = self.line,
        .literal = literal,
    };
}
/// Associates a reserved word's spelling with the token type it produces.
const KeywordEntry = struct {
/// Token type emitted when scanned text equals `text`.
kw: Token.Type,
/// Spelling of the keyword; compared byte-for-byte with `std.mem.eql`.
text: []const u8,
};
/// Table of reserved words searched linearly by `identOrKeyword`.
/// Only `var` is registered at the moment.
const keywords = [_]KeywordEntry{
.{ .kw = .kw_var, .text = "var" },
};
/// Scans the remainder of an identifier and returns either the matching
/// keyword token or an `.identifier` token.
///
/// `ini` is the code point of the identifier's first grapheme, which the
/// caller has already consumed. Following graphemes are consumed for as long
/// as `ziglyph.isAlphaNum` accepts their code point.
///
/// The returned token owns its lexeme (see `createToken`).
fn identOrKeyword(self: *Self, ini: u21) !Token {
    // The first grapheme has already been consumed and sits at
    // `self.source[self.start]`, so the lexeme built by `createToken` already
    // includes it. The parameter is kept for the existing call site.
    _ = ini;

    while (ziglyph.isAlphaNum(try self.peek())) {
        _ = try self.advance();
    }

    // Build the token once and compare its lexeme against the keyword table,
    // rather than re-encoding the code points into a scratch string first.
    var token = try self.createToken(.identifier, null);
    for (keywords) |entry| {
        if (std.mem.eql(u8, token.lexeme, entry.text)) {
            token.token_type = entry.kw;
            break;
        }
    }
    return token;
}
/// Turns the just-consumed code point `cp` into a token, consuming further
/// input where the token is longer than one grapheme.
///
/// Returns `null` when `cp` starts nothing this lexer recognises (it is
/// neither one of the punctuation characters below nor a letter).
/// A `//` sequence yields a `.comment` token running up to, but not
/// including, the next '\n' or the end of input.
fn identifyToken(self: *Self, cp: u21) !?Token {
    const kind: Token.Type = switch (cp) {
        // Single-character tokens.
        '(' => .left_paren,
        ')' => .right_paren,
        '{' => .left_brace,
        '}' => .right_brace,
        ',' => .comma,
        '.' => .dot,
        '-' => .minus,
        '+' => .plus,
        ';' => .semicolon,
        '*' => .star,
        // Operators that may be followed by '='.
        '!' => if (try self.match('=')) .bang_equal else .bang,
        '=' => if (try self.match('=')) .equal_equal else .equal,
        '>' => if (try self.match('=')) .greater_equal else .greater,
        '<' => if (try self.match('=')) .less_equal else .less,
        // Either a division sign or the start of a line comment.
        '/' => blk: {
            if (!(try self.match('/'))) break :blk .slash;
            while (try self.peek() != '\n' and !self.isAtEnd()) {
                _ = try self.advance();
            }
            break :blk .comment;
        },
        else => {
            if (ziglyph.isLetter(cp)) {
                return try self.identOrKeyword(cp);
            }
            return null;
        },
    };
    return try self.createToken(kind, null);
}
/// Scans one token starting at the cursor and appends it to `list`.
///
/// Code points that `identifyToken` rejects are collected until a token is
/// recognised or the input ends, then reported together through a single
/// `Error.report` call. The caller must ensure the input is not exhausted
/// when calling this function.
///
/// NOTE(review): `identifyToken` has no case for whitespace, so spaces and
/// newlines appear to go through the unknown-code-point path, and nothing
/// here advances `self.line` — confirm whether that is intended.
fn scanToken(self: *Self, list: *std.ArrayList(Token)) !void {
    var c = try self.advance();
    var token = try self.identifyToken(c);
    // A recognised token owns its lexeme. Until it has been handed over to
    // `list`, release that lexeme if the report or the append below fails.
    errdefer if (token) |pending| self.allocator.free(pending.lexeme);

    var unknown = std.ArrayList(u21).init(self.allocator);
    defer unknown.deinit();
    while (token == null) {
        try unknown.append(c);
        // The rejected code point is not part of the next token.
        self.start = self.index;
        if (self.isAtEnd()) break;
        c = try self.advance();
        token = try self.identifyToken(c);
    }

    if (unknown.items.len > 0) {
        var unknown_string = std.ArrayList(u8).init(self.allocator);
        defer unknown_string.deinit();
        const string_writer = unknown_string.writer();
        for (unknown.items) |ucp| {
            try string_writer.print("{u}", .{ucp});
        }
        try Error.report(self.line, "lexer", "Unknown codepoints '{}'", .{std.zig.fmtEscapes(unknown_string.items)});
    }

    if (token) |atoken| {
        try list.append(atoken);
    }
}