I asked on the lua-l mailing list, and this seems like the cleanest approach.
QM5V4K7DTRL24Y6HQXJXEZJLLJLO7ICVCOZW2AT76XATBUGA5WOAC -- Return the first word, after skipping Lua comments.if not buf:match('^%s*%-%-') thenreturn buf:match('^%s*(%S+)') -- avoid unnecessary allocations
-- NOTE(review): this chunk arrived with its line breaks stripped. The two
-- statements below are the tail of the preceding caller, whose header is not
-- visible here. They are kept as a comment so that this block parses by
-- itself; restore them when re-joining this block with the caller:
--   return first_noncomment_word(buf)
--   end

-- Return the first word (separated by whitespace) that's not in a Lua comment,
-- or the empty string if there's nothing.
-- Ignore strings; we don't expect them to be the first word in a program.
--
-- Arguments:
--   str: the text to scan (scanned byte by byte; not Unicode-aware, hopefully
--        it doesn't need to be)
-- Returns:
--   the first run of non-whitespace bytes that lies outside any leading
--   comment, or '' if the text holds only whitespace and comments, or if a
--   leading comment is never terminated.
--
-- Comments are only recognized where a word could start. A '--' in the middle
-- of a word (e.g. 'a--b') is returned as part of that word.
function first_noncomment_word(str)
  local pos = 1
  while pos <= #str do
    if str:sub(pos, pos+1) == '--' then
      -- definitely start of a comment
      -- '^' anchors the match at the init position, i.e. right after the '--'
      local long_comment_header = str:match('^%[=*%[', pos+2)
      if long_comment_header then
        -- long comment: '--[[', '--[=[', '--[==[', ...
        -- the trailer mirrors the header: '[=[' is closed by ']=]'
        -- (gsub also returns a count; the single local keeps just the string)
        local long_comment_trailer = long_comment_header:gsub('%[', ']')
        local _, trailer_end = str:find(long_comment_trailer, pos+2+#long_comment_header, --[[plain]]true)
        if trailer_end == nil then return '' end  -- incomplete comment; no first word
        pos = trailer_end+1
      else
        -- line comment; resume at the newline, which the whitespace branch
        -- then steps over
        pos = str:find('\n', pos, --[[plain]]true)
        if pos == nil then return '' end  -- incomplete comment; no first word
      end
      -- Go around the loop again rather than falling through to the word
      -- check: the very next bytes may open another comment.
    elseif str:find('^%s', pos) then
      pos = pos+1
    else
      -- any non-whitespace that's not a comment is the first word
      -- (this includes a lone '-' that isn't followed by a second '-')
      return str:match('^%S*', pos)
    end
  end
  return ''
end

-- NOTE(review): the next line sat between the loop body and the final
-- `return ''` in the collapsed text. It does not fit inside
-- first_noncomment_word; presumably it is the superseded implementation
-- (confirm against the change history). It is kept verbatim:
-- wastefully strip out comments everywhere, not just at the startbuf = buf:gsub('%-%-[^\n]*', '') -- line commentsreturn buf:match('^%s*(%S+)')

-- Checks for first_noncomment_word. check_eq is not defined in this chunk; it
-- is called as check_eq(actual, expected, label).
function test_first_noncomment_word()
  -- plain words
  check_eq(first_noncomment_word(''), '', 'empty string')
  check_eq(first_noncomment_word('abc'), 'abc', 'single word')
  check_eq(first_noncomment_word('abc def'), 'abc', 'stop at space')
  check_eq(first_noncomment_word('abc\tdef'), 'abc', 'stop at tab')
  check_eq(first_noncomment_word('abc\ndef'), 'abc', 'stop at newline')
  -- complete comments followed by a word
  check_eq(first_noncomment_word('-- abc\ndef'), 'def', 'ignore line comment')
  check_eq(first_noncomment_word('--[[abc]] def'), 'def', 'ignore block comment')
  check_eq(first_noncomment_word('--[[abc\n]] def'), 'def', 'ignore multi-line block comment')
  check_eq(first_noncomment_word('--[[abc\n--]] def'), 'def', 'ignore comment leader before block comment trailer')
  check_eq(first_noncomment_word('--[=[abc]=] def'), 'def', 'ignore long comment')
  check_eq(first_noncomment_word('--[=[abc]] def ]=] ghi'), 'ghi', 'ignore long comment containing block comment trailer')
  check_eq(first_noncomment_word('--[===[abc\n\ndef ghi\njkl]===]mno\npqr'), 'mno', 'ignore multi-line long comment directly followed by a word')
  -- comments directly followed by another comment
  check_eq(first_noncomment_word('--[[abc]]--[[def]] ghi'), 'ghi', 'ignore back-to-back block comments')
  check_eq(first_noncomment_word('--[[abc]]-- def\nghi'), 'ghi', 'ignore line comment right after block comment')
  -- incomplete tokens and comment-only inputs
  check_eq(first_noncomment_word('-'), '-', 'incomplete comment token')
  check_eq(first_noncomment_word('--abc'), '', 'incomplete line comment')
  check_eq(first_noncomment_word('--abc\n'), '', 'just a line comment')
  check_eq(first_noncomment_word('--abc\n '), '', 'just a line comment 2')
  check_eq(first_noncomment_word('--[ab\n'), '', 'incomplete block comment token is a line comment')
  check_eq(first_noncomment_word('--[[ab'), '', 'incomplete block comment')
  check_eq(first_noncomment_word('--[[ab\n]'), '', 'incomplete block comment 2')
  check_eq(first_noncomment_word('--[=[ab\n]] ]='), '', 'incomplete block comment 3')
  check_eq(first_noncomment_word('--[=[ab\n]] ]=]'), '', 'just a block comment')
  check_eq(first_noncomment_word('--[=[ab\n]] ]=] \n \n '), '', 'just a block comment 2')
-- NOTE(review): the closing `end` below was not visible in this chunk; it is
-- added so the definition parses. Drop it if the function continues in the
-- original file.
end