I asked on the lua-l mailing list, and this seems like the cleanest approach.
OZJDZHUOT7B3CKUUQXRTFHBX6AJOOARG26UBRT5ZYBNRIVWW3OTQC
-- Return the first word, after skipping Lua comments.
if not buf:match('^%s*%-%-') then
return buf:match('^%s*(%S+)') -- avoid unnecessary allocations
return first_noncomment_word(buf)
end
-- Return the first whitespace-delimited word of `str` that is not inside a
-- Lua comment, or the empty string if there is none (this includes the case
-- where a comment is never terminated).
--
-- Recognizes line comments (`-- ...` up to a newline) and long comments of
-- any level (`--[[ ... ]]`, `--[=[ ... ]=]`, ...), including several comments
-- in a row with no whitespace between them.
-- Ignores strings; we don't expect them to be the first word in a program.
-- Scans byte by byte, so it is not Unicode-aware; hopefully it doesn't need
-- to be.
function first_noncomment_word(str)
  local pos = 1
  while pos <= #str do
    if str:sub(pos, pos+1) == '--' then
      -- definitely the start of a comment; decide which kind
      local long_comment_header = str:match('^%[=*%[', pos+2)
      if long_comment_header then
        -- long comment: the trailer mirrors the header, e.g. [==[ -> ]==]
        -- (gsub's second result, the substitution count, is dropped here)
        local long_comment_trailer = long_comment_header:gsub('%[', ']')
        local trailer_start = str:find(long_comment_trailer,
                                       pos + 2 + #long_comment_header,
                                       --[[plain]]true)
        if trailer_start == nil then return '' end -- incomplete comment; no first word
        pos = trailer_start + #long_comment_trailer
      else
        -- line comment: runs up to and including the next newline
        local newline = str:find('\n', pos, --[[plain]]true)
        if newline == nil then return '' end -- incomplete comment; no first word
        pos = newline + 1
      end
      -- loop again rather than falling through, so that a comment directly
      -- following this one is skipped as well
    elseif str:sub(pos, pos):match('%s') then
      pos = pos + 1
    else
      -- any non-whitespace that's not a comment is the first word
      return str:match('^%S*', pos)
    end
  end
  return ''
end
function test_first_noncomment_word()
check_eq(first_noncomment_word(''), '', 'empty string')
check_eq(first_noncomment_word('abc'), 'abc', 'single word')
check_eq(first_noncomment_word('abc def'), 'abc', 'stop at space')
check_eq(first_noncomment_word('abc\tdef'), 'abc', 'stop at tab')
check_eq(first_noncomment_word('abc\ndef'), 'abc', 'stop at newline')
check_eq(first_noncomment_word('-- abc\ndef'), 'def', 'ignore line comment')
check_eq(first_noncomment_word('--[[abc]] def'), 'def', 'ignore block comment')
check_eq(first_noncomment_word('--[[abc\n]] def'), 'def', 'ignore multi-line block comment')
check_eq(first_noncomment_word('--[[abc\n--]] def'), 'def', 'ignore comment leader before block comment trailer')
check_eq(first_noncomment_word('--[=[abc]=] def'), 'def', 'ignore long comment')
check_eq(first_noncomment_word('--[=[abc]] def ]=] ghi'), 'ghi', 'ignore long comment containing block comment trailer')
check_eq(first_noncomment_word('--[===[abc\n\ndef ghi\njkl]===]mno\npqr'), 'mno', 'ignore long comment containing block comment trailer')
check_eq(first_noncomment_word('-'), '-', 'incomplete comment token')
check_eq(first_noncomment_word('--abc'), '', 'incomplete line comment')
check_eq(first_noncomment_word('--abc\n'), '', 'just a line comment')
check_eq(first_noncomment_word('--abc\n '), '', 'just a line comment 2')
check_eq(first_noncomment_word('--[ab\n'), '', 'incomplete block comment token is a line comment')
check_eq(first_noncomment_word('--[[ab'), '', 'incomplete block comment')
check_eq(first_noncomment_word('--[[ab\n]'), '', 'incomplete block comment 2')
check_eq(first_noncomment_word('--[=[ab\n]] ]='), '', 'incomplete block comment 3')
check_eq(first_noncomment_word('--[=[ab\n]] ]=]'), '', 'just a block comment')
check_eq(first_noncomment_word('--[=[ab\n]] ]=] \n \n '), '', 'just a block comment 2')