213 lines
5.3 KiB
Lua
213 lines
5.3 KiB
Lua
|
--[[
|
||
|
lua_lexer_loose.lua.
|
||
|
Loose lexing of Lua code. See README.
|
||
|
|
||
|
WARNING: This code is preliminary and may have errors
|
||
|
in its current form.
|
||
|
|
||
|
(c) 2013 David Manura. MIT License.
|
||
|
--]]
|
||
|
|
||
|
local M = {}
|
||
|
|
||
|
-- based on LuaBalanced
|
||
|
-- Matches a Lua string literal starting exactly at `pos` (default 1).
--
-- Recognizes short strings delimited by ' or " (a backslash skips the
-- character after it) and long-bracket strings of any level ([[..]],
-- [=[..]=], ...).
--
-- Returns:
--   * the literal text and the position just past it, when terminated;
--   * the rest of `s` and #s + 1, when the literal is not terminated;
--   * nil and `pos`, when no string literal starts at `pos`.
local function match_string(s, pos)
  pos = pos or 1
  local start = pos
  local quote = s:sub(start, start)

  if quote ~= '"' and quote ~= "'" then
    -- Not a quoted string: try a long bracket; `level` is its run of '='.
    local level = s:match("^%[(=*)%[", start)
    if not level then
      return nil, pos
    end
    local _, close_end = s:find("%]" .. level .. "%]", start)
    if not close_end then
      return s:sub(start), #s + 1 -- not terminated string
    end
    return s:sub(start, close_end), close_end + 1
  end

  -- Quoted string: hop between occurrences of the quote or a backslash.
  local stoppers = "[" .. quote .. "\\]"
  local cursor = start + 1
  repeat
    cursor = s:find(stoppers, cursor)
    if not cursor then
      return s:sub(start), #s + 1 -- not terminated string
    end
    if s:sub(cursor, cursor) == quote then
      return s:sub(start, cursor), cursor + 1
    end
    -- Backslash: skip it together with the character it escapes.
    cursor = cursor + 2
  until false
end
|
||
|
|
||
|
-- based on LuaBalanced
|
||
|
-- Matches a Lua comment starting exactly at `pos` (default 1).
--
-- Returns the comment text (including the leading '--') and the position
-- just past it, or nil and `pos` when `s` does not have '--' at `pos`.
-- A line comment includes its trailing newline when one is present.
local function match_comment(s, pos)
  pos = pos or 1
  if s:sub(pos, pos + 1) ~= '--' then
    return nil, pos
  end

  local body_start = pos + 2

  -- '--[' may open a long comment; match_string decides whether the
  -- bracket really is a long bracket.
  if s:sub(body_start, body_start) == '[' then
    local long_body, after_long = match_string(s, body_start)
    if long_body then
      return '--' .. long_body, after_long
    end
  end

  -- Plain line comment: everything up to and including the next newline.
  local line, after_line = s:match('^([^\n]*\n?)()', body_start)
  return '--' .. line, after_line
end
|
||
|
|
||
|
-- note: matches invalid numbers too (for example, 0x)
|
||
|
-- Matches a number-like token starting exactly at `pos` (default 1).
--
-- Accepts decimal and hexadecimal ('0x'/'0X') forms with an optional
-- fraction and exponent ('e'/'E' for decimal, 'p'/'P' for hex), an optional
-- trailing 'i'/'I', and integer forms ending in an optional 'u'/'U'
-- followed by two 'l'/'L' characters. The match is deliberately loose:
-- incomplete numbers such as a bare '0x' are returned as well.
--
-- Returns the matched text, or nil when nothing number-like starts at `pos`.
local function match_numberlike(s, pos)
  -- Default the start position like the other matchers do; without this a
  -- call that omits `pos` fails on the `pos + #tok` arithmetic below.
  pos = pos or 1

  local hex = s:match('^0[xX]', pos)
  if hex then pos = pos + #hex end

  local longint = (hex and '^%x+' or '^%d+') .. '[uU]?[lL][lL]'
  local mantissa1 = hex and '^%x+%.?%x*' or '^%d+%.?%d*'
  local mantissa2 = hex and '^%.%x+' or '^%.%d+'
  local exponent = hex and '^[pP][+%-]?%x*' or '^[eE][+%-]?%d*'
  local imaginary = '^[iI]'

  -- The suffixed integer form is tried first so its suffix is not left
  -- behind by the mantissa patterns.
  local tok = s:match(longint, pos)
  if not tok then
    tok = s:match(mantissa1, pos) or s:match(mantissa2, pos)
    if tok then
      local tail = s:match(exponent, pos + #tok)
      if tail then tok = tok .. tail end
      tail = s:match(imaginary, pos + #tok)
      if tail then tok = tok .. tail end
    end
  end

  if tok then
    return (hex or '') .. tok
  end
  -- Either nothing matched (nil) or only the hex prefix was present.
  return hex
end
|
||
|
|
||
|
-- Builds a set (table mapping key -> true) from the individual characters
-- of string `s`.
local function newset(s)
  local set = {}
  for i = 1, #s do
    set[s:sub(i, i)] = true
  end
  return set
end
|
||
|
-- "Quote words": builds a set (table mapping word -> true) from the
-- whitespace-separated words of string `s`.
local function qws(s)
  local words = {}
  s:gsub('%S+', function(word)
    words[word] = true
  end)
  return words
end
|
||
|
|
||
|
-- Characters that can start an identifier or keyword.
local sym = newset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
-- Decimal digits.
local dig = newset('0123456789')
-- Pattern (with one capture) matching an identifier.
local name = "([_A-Za-z][_A-Za-z0-9]*)"

-- Single-character operators and punctuation. '&' and '|' are included so
-- that the bitwise operators are recognized alongside '~', '<<' and '>>'
-- instead of being reported as unknown characters.
local op = newset('=~<>.+-*/%^#;:,{}[]()&|')

-- Two-character operators.
for two_char in ('== <= >= ~= .. << >> //'):gmatch('%S+') do
  op[two_char] = true
end

-- Reserved words.
local is_keyword = qws[[
and break do else elseif end false for function if
in local nil not or repeat return
then true until while goto]]
|
||
|
|
||
|
--- Loosely tokenizes Lua source, reporting every token through a callback.
--
-- The lexer is tolerant: malformed input never raises an error on account of
-- the input itself; characters it cannot classify are reported with the tag
-- 'Unknown'.
--
-- @param code  string holding the source text.
-- @param f     callback invoked as f(tag, text, pos) for each token, where
--              `tag` is one of 'Shebang', 'Keyword', 'Id', 'Comment',
--              'Label', 'String', 'Number' or 'Unknown', `text` is the token
--              text and `pos` is its starting position in `code`.
--              Operators and punctuation are reported with tag 'Keyword'.
-- @param pos   optional position at which to start lexing (default 1).
function M.lex(code, f, pos)
  pos = pos or 1

  -- Shebang line. The newline is optional so that source consisting of just
  -- the shebang is still recognized, and the token is reported at the
  -- position where it was actually matched.
  local shebang = code:match('^#![^\n]*\n?', pos)
  if shebang then
    f('Shebang', shebang, pos)
    pos = pos + #shebang
  end

  while pos <= #code do
    -- p2: position of the next non-space character
    -- n1: that character; n3: the character after it ('' when it is
    --     whitespace or the end of input); n2: n1 .. n3
    local p2, n2, n1, n3 = code:match('^%s*()((%S)(%S?))', pos)
    if not p2 then break end -- only whitespace remains
    pos = p2

    if sym[n1] then
      local tok = code:match('^'..name, pos)
      assert(tok)
      f(is_keyword[tok] and 'Keyword' or 'Id', tok, pos)
      pos = pos + #tok
    elseif n2 == '--' then
      local tok, pos2 = match_comment(code, pos)
      assert(tok)
      f('Comment', tok, pos)
      pos = pos2
    elseif n2 == '::' then
      local tok = code:match('^(::%s*'..name..'%s*::)', pos)
      if tok then
        f('Label', tok, pos)
        pos = pos + #tok
      else
        f('Unknown', code:sub(pos, pos+1), pos) -- unterminated label
        pos = pos + 2
      end
    elseif n1 == '\'' or n1 == '\"' or n2 == '[[' or n2 == '[=' then
      -- match_string returns the remainder of the source for an
      -- unterminated string, so a nil result can only mean that '[='
      -- does not open a long bracket.
      local tok = match_string(code, pos)
      if tok then
        f('String', tok, pos)
        pos = pos + #tok
      else
        -- Report the lone '[' as punctuation and keep lexing rather than
        -- discarding the rest of the source as one unknown token.
        f('Keyword', n1, pos)
        pos = pos + 1
      end
    elseif dig[n1] or (n1 == '.' and dig[n3]) then
      local tok = match_numberlike(code, pos)
      assert(tok)
      f('Number', tok, pos)
      pos = pos + #tok
    elseif op[n2] then
      local tok = n2
      -- '..' immediately followed by another '.' is the vararg token.
      if n2 == '..' and code:match('^%.', pos+2) then
        tok = '...'
      end
      f('Keyword', tok, pos)
      pos = pos + #tok
    elseif op[n1] then
      f('Keyword', n1, pos)
      pos = pos + 1
    else
      f('Unknown', n1, pos)
      pos = pos + 1
    end
  end
end
|
||
|
|
||
|
-- Token stream with one item of lookahead.
-- Instances are tables with field `f`, a function that produces the next
-- item on each call, and use `Stream` as their metatable.
local Stream = {}
Stream.__index = Stream

-- Consumes and returns the next item: the item buffered by a previous
-- peek() if there is one, otherwise a fresh item from the producer.
-- The `val` argument is not used.
function Stream:next(val)
  local buffered = self._next
  self._next = nil
  if buffered then
    return buffered
  end
  return self.f()
end

-- Returns the next item without consuming it; the item is buffered so that
-- the following next() or peek() returns the same one.
function Stream:peek()
  local buffered = self._next
  if not buffered then
    buffered = self.f()
    self._next = buffered
  end
  return buffered
end
|
||
|
|
||
|
--- Creates a coroutine-backed token stream over `code`.
--
-- @param code  string holding the source text.
-- @param f     optional function to run as the coroutine body in place of
--              the default producer.
-- @param pos   optional start position handed to M.lex by the default
--              producer.
-- @return a Stream object (next/peek). With the default producer, items are
--         tables {tag=<tag>, <text>, lineinfo=<position>}, followed by a
--         final {tag='Eof', lineinfo=#code+1}.
function M.lexc(code, f, pos)
  local yield = coroutine.yield

  -- Default producer: yields one table per token reported by M.lex.
  local function produce()
    local function emit(tag, text, at)
      -- Comments are dropped because they can appear in the middle of a
      -- statement and would interfere with its processing.
      if tag ~= 'Comment' then
        yield {tag=tag, text, lineinfo=at}
      end
    end
    M.lex(code, emit, pos)
    yield {tag='Eof', lineinfo = #code+1}
  end

  local stream = {f = coroutine.wrap(f or produce)}
  return setmetatable(stream, Stream)
end
|
||
|
|
||
|
return M
|