-------------------------------------------------------------------------------
-- Copyright (c) 2006-2013 Fabien Fleutot and others.
--
-- All rights reserved.
--
-- This program and the accompanying materials are made available
-- under the terms of the Eclipse Public License v1.0 which
-- accompanies this distribution, and is available at
-- http://www.eclipse.org/legal/epl-v10.html
--
-- This program and the accompanying materials are also made available
-- under the terms of the MIT public license which accompanies this
-- distribution, and is available at http://www.lua.org/license.html
--
-- Contributors:
--     Fabien Fleutot - API and implementation
--
-------------------------------------------------------------------------------

require 'checks'

local M = { }

local lexer = { alpha={ }, sym={ } }
lexer.__index=lexer
lexer.__type='lexer.stream'

M.lexer = lexer


local debugf = function() end
-- local debugf=printf

----------------------------------------------------------------------
-- Some locale settings produce bad results, e.g. the French locale
-- expects float numbers to use commas instead of periods.
-- TODO: change the number parser into something locale-independent;
-- locales are nasty.
----------------------------------------------------------------------
os.setlocale('C')

local MT = { }

M.metatables=MT

----------------------------------------------------------------------
-- Create a new metatable, for a new class of objects.
----------------------------------------------------------------------
local function new_metatable(name)
    local mt = { __type = 'lexer.'..name }; mt.__index = mt
    MT[name] = mt
end


----------------------------------------------------------------------
-- Position: represent a point in a source file.
----------------------------------------------------------------------
new_metatable 'position'

local position_idx=1

function M.new_position(line, column, offset, source)
    checks('number', 'number', 'number', 'string')
    local id = position_idx; position_idx = position_idx+1
    return setmetatable({line=line, column=column, offset=offset,
                         source=source, id=id}, MT.position)
end

function MT.position :__tostring()
    return string.format("<%s%s|L%d|C%d|K%d>",
        self.comments and "C|" or "",
        self.source, self.line, self.column, self.offset)
end


----------------------------------------------------------------------
-- Position factory: convert offsets into line/column/offset positions.
----------------------------------------------------------------------
new_metatable 'position_factory'

function M.new_position_factory(src, src_name)
    -- assert(type(src)=='string')
    -- assert(type(src_name)=='string')
    local lines = { 1 }
    for offset in src :gmatch '\n()' do table.insert(lines, offset) end
    local max = #src+1
    table.insert(lines, max+1) -- +1 includes Eof
    return setmetatable({ src_name=src_name, line2offset=lines, max=max },
                        MT.position_factory)
end

function MT.position_factory :get_position (offset)
    -- assert(type(offset)=='number')
    assert(offset<=self.max)
    local line2offset = self.line2offset
    local left = self.last_left or 1
    if offset<line2offset[left] then left=1 end
    local right = left+1
    if line2offset[right]<=offset then right = right+1 end
    if line2offset[right]<=offset then right = #line2offset end
    -- Binary search for the line containing [offset];
    -- invariant: line2offset[left] <= offset < line2offset[right].
    while true do
        if left+1==right then break end
        local middle = math.floor((left+right)/2)
        if line2offset[middle]<=offset then left=middle else right=middle end
    end
    local line = left
    local column = offset - line2offset[line] + 1
    self.last_left = left
    return M.new_position(line, column, offset, self.src_name)
end

----------------------------------------------------------------------
-- Lineinfo: represent a node's range in a source file;
-- embed information about prefix and suffix comments.
----------------------------------------------------------------------
new_metatable 'lineinfo'

function M.new_lineinfo(first, last)
    checks('lexer.position', 'lexer.position')
    return setmetatable({first=first, last=last}, MT.lineinfo)
end

function MT.lineinfo :__tostring()
    local fli, lli = self.first, self.last
    local line   = fli.line;   if line~=lli.line     then line  =line  ..'-'..lli.line   end
    local column = fli.column; if column~=lli.column then column=column..'-'..lli.column end
    local offset = fli.offset; if offset~=lli.offset then offset=offset..'-'..lli.offset end
    return string.format("<%s%s|L%s|C%s|K%s%s>",
                         fli.comments and "C|" or "",
                         fli.source, line, column, offset,
                         lli.comments and "|C" or "")
end
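----------------------------------------------------------------------
-- Illustrative sketch (not part of the module; names are demo-only):
-- how positions, the position factory and lineinfo compose.
--
--   local pf = M.new_position_factory("print 'hi'\nprint 'ho'\n", "demo")
--   local p1 = pf :get_position(1)      -- tostring: <demo|L1|C1|K1>
--   local p2 = pf :get_position(12)     -- tostring: <demo|L2|C1|K12>
--   local li = M.new_lineinfo(p1, p2)   -- tostring: <demo|L1-2|C1|K1-12>
----------------------------------------------------------------------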
----------------------------------------------------------------------
-- Token: atomic Lua language element, with a category, a content,
-- and some lineinfo relating it to its original source.
----------------------------------------------------------------------
new_metatable 'token'

function M.new_token(tag, content, lineinfo)
    --printf("TOKEN `%s{ %q, lineinfo = %s} boundaries %d, %d",
    --       tag, content, tostring(lineinfo), lineinfo.first.id, lineinfo.last.id)
    return setmetatable({tag=tag, lineinfo=lineinfo, content}, MT.token)
end

function MT.token :__tostring()
    --return string.format("`%s{ %q, %s }", self.tag, self[1], tostring(self.lineinfo))
    return string.format("`%s %q", self.tag, self[1])
end


----------------------------------------------------------------------
-- Comment: series of comment blocks with associated lineinfo.
-- To be attached to the tokens just before and just after them.
----------------------------------------------------------------------
new_metatable 'comment'

function M.new_comment(lines)
    local first = lines[1].lineinfo.first
    local last  = lines[#lines].lineinfo.last
    local lineinfo = M.new_lineinfo(first, last)
    return setmetatable({lineinfo=lineinfo, unpack(lines)}, MT.comment)
end

function MT.comment :text()
    local last_line = self[1].lineinfo.last.line
    local acc = { }
    for i, line in ipairs(self) do
        local nreturns = line.lineinfo.first.line - last_line
        table.insert(acc, ("\n"):rep(nreturns))
        table.insert(acc, line[1])
    end
    return table.concat(acc)
end

function M.new_comment_line(text, lineinfo, nequals)
    checks('string', 'lexer.lineinfo', '?number')
    return { lineinfo = lineinfo, text, nequals }
end


----------------------------------------------------------------------
-- Patterns used by [lexer :extract] to decompose the raw string into
-- correctly tagged tokens.
----------------------------------------------------------------------
lexer.patterns = {
    spaces              = "^[ \r\n\t]*()",
    short_comment       = "^%-%-([^\n]*)\n?()",
    --final_short_comment = "^%-%-([^\n]*)()$",
    long_comment        = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
    long_string         = "^%[(=*)%[\n?(.-)%]%1%]()",
    number_longint      = "^%d+[uU]?[lL][lL]()",
    number_longint_hex  = "^%x+[uU]?[lL][lL]()",
    number_mantissa     = { "^%d+%.?%d*()", "^%d*%.%d+()" },
    number_mantissa_hex = { "^%x+%.?%x*()", "^%x*%.%x+()" }, --Lua5.1 and Lua5.2
    number_exponent     = "^[eE][%+%-]?%d+()",
    number_exponent_hex = "^[pP][%+%-]?%d+()", --Lua5.2
    number_hex          = "^0[xX]()",
    number_imaginary    = "^[iI]()",
    word                = "^([%a_][%w_]*)()",
}
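----------------------------------------------------------------------
-- Illustrative sketch (demo values only): the patterns capture the
-- token's content and, through an empty capture `()`, the offset just
-- past it, e.g.:
--
--   local src = "--[==[ long\ncomment ]==] print(1)"
--   local equals, content, j = src :match (lexer.patterns.long_comment, 1)
--   -- equals == "==", content == " long\ncomment ",
--   -- j == first offset after the closing "]==]"
----------------------------------------------------------------------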
----------------------------------------------------------------------
-- Unescape a whole string, applying [unesc_digits] and
-- [unesc_letter] as many times as required.
----------------------------------------------------------------------
local function unescape_string (s)

    -- Turn the digits of an escape sequence into the corresponding
    -- character, e.g. [unesc_digits("123") == string.char(123)].
    local function unesc_digits (backslashes, digits)
        if #backslashes%2==0 then
            -- Even number of backslashes, they escape each other, not the digits.
            -- Return them so that unesc_letter() can treat them
            return backslashes..digits
        else
            -- Remove the odd backslash, which escapes the number sequence.
            -- The rest will be returned and parsed by unesc_letter()
            backslashes = backslashes :sub (1,-2)
        end
        local k, j, i = digits :reverse() :byte(1, 3)
        local z = string.byte "0"
        local code = (k or z) + 10*(j or z) + 100*(i or z) - 111*z
        if code > 255 then
            error ("Illegal escape sequence '\\"..digits..
                   "' in string: ASCII codes must be in [0..255]")
        end
        local c = string.char (code)
        if c == '\\' then c = '\\\\' end -- parsed by unesc_letter (test: "\092b" --> "\\b")
        return backslashes..c
    end

    -- Turn the hex digits of an escape sequence into the corresponding char.
    local function unesc_hex(backslashes, digits)
        if #backslashes%2==0 then
            return backslashes..'x'..digits
        else
            backslashes = backslashes :sub (1,-2)
        end
        local c = string.char(tonumber(digits,16))
        if c == '\\' then c = '\\\\' end -- parsed by unesc_letter (test: "\x5cb" --> "\\b")
        return backslashes..c
    end

    -- Handle Lua 5.2 \z sequences
    local function unesc_z(backslashes, more)
        if #backslashes%2==0 then
            return backslashes..more
        else
            return backslashes :sub (1,-2)
        end
    end

    -- Take a letter [x], and return the character represented by the
    -- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
    local function unesc_letter(x)
        local t = {
            a = "\a", b = "\b", f = "\f",
            n = "\n", r = "\r", t = "\t", v = "\v",
            ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }
        return t[x] or x
    end

    s = s: gsub ("(\\+)(z%s*)", unesc_z) -- Lua 5.2
    s = s: gsub ("(\\+)([0-9][0-9]?[0-9]?)", unesc_digits)
    s = s: gsub ("(\\+)x([0-9a-fA-F][0-9a-fA-F])", unesc_hex) -- Lua 5.2
    s = s: gsub ("\\(%D)",unesc_letter)
    return s
end

lexer.extractors = {
    "extract_long_comment", "extract_short_comment",
    "extract_short_string", "extract_word", "extract_number",
    "extract_long_string", "extract_symbol" }
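----------------------------------------------------------------------
-- Note on ordering: the extractors above are tried in list order, so
-- extract_long_comment must run before extract_short_comment
-- (otherwise "--[[...]]" would be misread as a short comment).
-- Illustrative sketch of the extractor protocol (demo values only):
-- each [lexer :extract_xxx()] method inspects self.src at self.i and,
-- on success, advances self.i past the token and returns a tag plus
-- content, e.g.:
--
--   -- with self.src == "foo = 42" and self.i == 1,
--   -- self :extract_word() --> 'Id', "foo", and self.i becomes 4
----------------------------------------------------------------------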
----------------------------------------------------------------------
-- Really extract next token from the raw string
-- (and update the index).
-- loc: offset of the position just after spaces and comments
-- previous_i: offset in src before extraction began
----------------------------------------------------------------------
function lexer :extract ()
    local attached_comments = { }
    local function gen_token(...)
        local token = M.new_token(...)
        if #attached_comments>0 then -- attach previous comments to token
            local comments = M.new_comment(attached_comments)
            token.lineinfo.first.comments = comments
            if self.lineinfo_last_extracted then
                self.lineinfo_last_extracted.comments = comments
            end
            attached_comments = { }
        end
        token.lineinfo.first.facing = self.lineinfo_last_extracted
        self.lineinfo_last_extracted.facing = assert(token.lineinfo.first)
        self.lineinfo_last_extracted = assert(token.lineinfo.last)
        return token
    end
    while true do -- loop until a non-comment token is found

        -- skip whitespaces
        self.i = self.src:match (self.patterns.spaces, self.i)
        if self.i>#self.src then
            local fli = self.posfact :get_position (#self.src+1)
            local lli = self.posfact :get_position (#self.src+1) -- ok?
            local tok = gen_token("Eof", "eof", M.new_lineinfo(fli, lli))
            tok.lineinfo.last.facing = lli
            return tok
        end
        local i_first = self.i -- loc = position after whitespaces

        -- try every extractor until a token is found
        for _, extractor in ipairs(self.extractors) do
            local tag, content, xtra = self [extractor] (self)
            if tag then
                local fli = self.posfact :get_position (i_first)
                local lli = self.posfact :get_position (self.i-1)
                local lineinfo = M.new_lineinfo(fli, lli)
                if tag=='Comment' then
                    local prev_comment = attached_comments[#attached_comments]
                    if not xtra -- new comment is short
                    and prev_comment and not prev_comment[2] -- prev comment is short
                    and prev_comment.lineinfo.last.line+1==fli.line then -- adjacent lines
                        -- concat with previous comment
                        prev_comment[1] = prev_comment[1].."\n"..content -- TODO quadratic, BAD!
                        prev_comment.lineinfo.last = lli
                    else -- accumulate comment
                        local comment = M.new_comment_line(content, lineinfo, xtra)
                        table.insert(attached_comments, comment)
                    end
                    break -- back to skipping spaces
                else -- not a comment: real token, return it
                    return gen_token(tag, content, lineinfo)
                end -- if token is a comment
            end -- if token found
        end -- for each extractor
    end -- while token is a comment
end -- :extract()


----------------------------------------------------------------------
-- Extract a short comment.
----------------------------------------------------------------------
function lexer :extract_short_comment()
    -- TODO: handle final_short_comment
    local content, j = self.src :match (self.patterns.short_comment, self.i)
    if content then self.i=j; return 'Comment', content, nil end
end

----------------------------------------------------------------------
-- Extract a long comment.
----------------------------------------------------------------------
function lexer :extract_long_comment()
    local equals, content, j = self.src:match (self.patterns.long_comment, self.i)
    if j then self.i = j; return "Comment", content, #equals end
end

----------------------------------------------------------------------
-- Extract a '...' or "..." short string.
----------------------------------------------------------------------
function lexer :extract_short_string()
    local k = self.src :sub (self.i,self.i)   -- first char
    if k~=[[']] and k~=[["]] then return end  -- no match
    local i = self.i + 1
    local j = i
    while true do
        -- next interesting char
        local x,y; x, j, y = self.src :match ("([\\\r\n"..k.."])()(.?)", j)
        if x == '\\' then
            if y == 'z' then -- Lua 5.2 \z
                j = self.src :match ("^%s*()", j+1)
            else
                j=j+1 -- escaped char
            end
        elseif x == k then break -- end of string
        else
            assert (not x or x=='\r' or x=='\n')
            return nil, 'Unterminated string'
        end
    end
    self.i = j

    return 'String', unescape_string (self.src :sub (i,j-2))
end

----------------------------------------------------------------------
-- Extract Id or Keyword.
----------------------------------------------------------------------
function lexer :extract_word()
    local word, j = self.src:match (self.patterns.word, self.i)
    if word then
        self.i = j
        return (self.alpha [word] and 'Keyword' or 'Id'), word
    end
end

----------------------------------------------------------------------
-- Extract Number.
----------------------------------------------------------------------
function lexer :extract_number()
    local patt = self.patterns
    local s = self.src
    local j = s:match(patt.number_hex, self.i)
    local hex = j ~= nil
    local longint   = hex and patt.number_longint_hex     or patt.number_longint
    local mantissa1 = hex and patt.number_mantissa_hex[1] or patt.number_mantissa[1]
    local mantissa2 = hex and patt.number_mantissa_hex[2] or patt.number_mantissa[2]
    local exponent  = hex and patt.number_exponent_hex    or patt.number_exponent
    if not hex then j = self.i end

    local t = s:match(longint, j)
    if t then
        j = t
    else
        j = s:match(mantissa1, j) or s:match(mantissa2, j)
        if not j then return end
        j = s:match(exponent, j) or j
        j = s:match(patt.number_imaginary, j) or j
    end
    local str = self.src:sub (self.i, j-1)
    self.i = j
    -- Number found, interpret it with tonumber() and return it;
    -- return str as the fallback when processing formats not supported
    -- by the current interpreter.
    return 'Number', (tonumber (str) or str)
end
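----------------------------------------------------------------------
-- Illustrative sketch (demo values only): inputs extract_number accepts.
--
--   "42", "0.5", ".5"    --> 'Number', 42 / 0.5 / 0.5
--   "0x1F"               --> 'Number', 31
--   "0x1p-2"             --> Lua 5.2 hex float; tonumber() handles it
--                            on 5.2/LuaJIT, older interpreters fall
--                            back to the raw string
--   "123ULL", "0x2aULL"  --> LuaJIT long integers, returned as strings
--                            wherever tonumber() rejects them
----------------------------------------------------------------------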
----------------------------------------------------------------------
-- Extract a long string.
----------------------------------------------------------------------
function lexer :extract_long_string()
    local _, content, j = self.src :match (self.patterns.long_string, self.i)
    if j then self.i = j; return 'String', content end
end

----------------------------------------------------------------------
-- Extract a symbol.
----------------------------------------------------------------------
function lexer :extract_symbol()
    local k = self.src:sub (self.i,self.i)
    local symk = self.sym [k] -- symbols starting with `k`
    if not symk then
        self.i = self.i + 1
        return 'Keyword', k
    end
    for _, sym in pairs (symk) do
        if sym == self.src:sub (self.i, self.i + #sym - 1) then
            self.i = self.i + #sym
            return 'Keyword', sym
        end
    end
    self.i = self.i+1
    return 'Keyword', k
end

----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
----------------------------------------------------------------------
function lexer :add (w, ...)
    assert(not ..., "lexer :add() takes only one arg, although possibly a table")
    if type (w) == "table" then
        for _, x in ipairs (w) do self :add (x) end
    else
        if w:match (self.patterns.word .. "$") then
            self.alpha [w] = true
        elseif w:match "^%p%p+$" then
            local k = w:sub(1,1)
            local list = self.sym [k]
            if not list then list = { }; self.sym [k] = list end
            table.insert (list, w)
        elseif w:match "^%p$" then
            return
        else
            error "Invalid keyword"
        end
    end
end

----------------------------------------------------------------------
-- Return the [n]th next token, without consuming it.
-- [n] defaults to 1. If it goes past the end of the stream, an EOF
-- token is returned.
----------------------------------------------------------------------
function lexer :peek (n)
    if not n then n=1 end
    if n > #self.peeked then
        for i = #self.peeked+1, n do
            self.peeked [i] = self :extract()
        end
    end
    return self.peeked [n]
end

----------------------------------------------------------------------
-- Return the [n]th next token, removing it as well as the 0..n-1
-- previous tokens. [n] defaults to 1. If it goes past the end of the
-- stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer :next (n)
    n = n or 1
    self :peek (n)
    local a
    for i=1,n do
        a = table.remove (self.peeked, 1)
        -- TODO: is this used anywhere? I think not.
        -- a.lineinfo.last may be nil.
        --self.lastline = a.lineinfo.last.line
    end
    self.lineinfo_last_consumed = a.lineinfo.last
    return a
end

----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
----------------------------------------------------------------------
-- FIXME there are more fields than that to save
function lexer :save () return { self.i; {unpack(self.peeked) } } end
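----------------------------------------------------------------------
-- Illustrative sketch (demo names only): save/restore supports
-- speculative parsing with backtracking. Mind the FIXME notes:
-- not all of the stream's state is captured.
--
--   local saved = stream :save()
--   local tok   = stream :next()   -- consume tokens tentatively
--   stream :restore (saved)        -- backtrack to the saved state
----------------------------------------------------------------------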
----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
-- FIXME there are more fields than that to restore
function lexer :restore (s) self.i=s[1]; self.peeked=s[2] end

----------------------------------------------------------------------
-- Resynchronize: cancel any token in self.peeked, by emptying the
-- list and resetting the indexes
----------------------------------------------------------------------
function lexer :sync()
    local p1 = self.peeked[1]
    if p1 then
        local li_first = p1.lineinfo.first
        if li_first.comments then li_first=li_first.comments.lineinfo.first end
        self.i = li_first.offset
        self.column_offset = self.i - li_first.column
        self.peeked = { }
        self.attached_comments = p1.lineinfo.first.comments or { }
    end
end

----------------------------------------------------------------------
-- Take the source and offset of an old lexer.
----------------------------------------------------------------------
function lexer :takeover(old)
    self :sync(); old :sync()
    for _, field in ipairs{ 'i', 'src', 'attached_comments', 'posfact' } do
        self[field] = old[field]
    end
    return self
end

----------------------------------------------------------------------
-- Return the current position in the sources. This position is between
-- two tokens, and can be within a space / comment area, and therefore
-- have a non-null width. :lineinfo_left() returns the beginning of the
-- separation area, :lineinfo_right() returns the end of that area.
--
--      ____ last consumed token        ____ first unconsumed token
--     /                               /
--    XXXXX                           YYYYY
--         \____                           \____
--               :lineinfo_left()                :lineinfo_right()
----------------------------------------------------------------------
function lexer :lineinfo_right() return self :peek(1).lineinfo.first end
function lexer :lineinfo_left()  return self.lineinfo_last_consumed end

----------------------------------------------------------------------
-- Create a new lexstream.
----------------------------------------------------------------------
function lexer :newstream (src_or_stream, name)
    name = name or "?"
    if type(src_or_stream)=='table' then -- it's a stream
        return setmetatable ({ }, self) :takeover (src_or_stream)
    elseif type(src_or_stream)=='string' then -- it's a source string
        local src = src_or_stream
        local pos1 = M.new_position(1, 1, 1, name)
        local stream = {
            src_name = name;      -- Name of the file
            src      = src;       -- The source, as a single string
            peeked   = { };       -- Already peeked, but not discarded yet, tokens
            i        = 1;         -- Character offset in src
            attached_comments = { }, -- comments accumulator
            lineinfo_last_extracted = pos1,
            lineinfo_last_consumed  = pos1,
            posfact  = M.new_position_factory (src_or_stream, name)
        }
        setmetatable (stream, self)

        -- Skip initial sharp-bang for Unix scripts
        -- FIXME: redundant with mlp.chunk()
        if src and src :match "^#!" then
            local endofline = src :find "\n"
            stream.i = endofline and (endofline + 1) or #src
        end
        return stream
    else
        assert(false, ":newstream() takes a source string or a stream, not a "..
                      type(src_or_stream))
    end
end
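----------------------------------------------------------------------
-- Illustrative sketch (demo names only): building and consuming a
-- stream. A real grammar would first :add() all its keywords/symbols.
--
--   local lx = lexer.lexer :clone()
--   lx :add{ 'local', 'function', '==', '~=' }
--   local stream = lx :newstream ("local x = 1", "demo.lua")
--   stream :peek()   -- `Keyword "local" (not consumed)
--   stream :next()   -- `Keyword "local" (consumed)
--   stream :next()   -- `Id "x"
----------------------------------------------------------------------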
----------------------------------------------------------------------
-- If there are no ... args, return the token a (whose truth value is
-- true) if it's a `Keyword{ }, or false. If there are ... args, they
-- have to be strings. If the token a is a keyword, and its content
-- is one of the ... args, then return it (its truth value is true).
-- If it's not a keyword, or not in ..., return false.
----------------------------------------------------------------------
function lexer :is_keyword (a, ...)
    if not a or a.tag ~= "Keyword" then return false end
    local words = {...}
    if #words == 0 then return a[1] end
    for _, w in ipairs (words) do
        if w == a[1] then return w end
    end
    return false
end

----------------------------------------------------------------------
-- Cause an error if the next token isn't a keyword whose content
-- is listed among ... args (which have to be strings).
----------------------------------------------------------------------
function lexer :check (...)
    local words = {...}
    local a = self :next()
    local function err ()
        error ("Got " .. tostring (a) ..
               ", expected one of these keywords: '" ..
               table.concat (words, "', '") .. "'")
    end
    if not a or a.tag ~= "Keyword" then err () end
    if #words == 0 then return a[1] end
    for _, w in ipairs (words) do
        if w == a[1] then return w end
    end
    err ()
end

----------------------------------------------------------------------
-- Clone the lexer: the copy gets its own keyword and symbol tables,
-- so that keywords can be added to it without affecting the original.
----------------------------------------------------------------------
function lexer :clone()
    local alpha_clone, sym_clone = { }, { }
    for word in pairs(self.alpha) do alpha_clone[word]=true end
    for letter, list in pairs(self.sym) do sym_clone[letter] = { unpack(list) } end
    local clone = { alpha=alpha_clone, sym=sym_clone }
    setmetatable(clone, self)
    clone.__index = clone
    return clone
end

----------------------------------------------------------------------
-- Cancel everything left in a lexer; all subsequent attempts at
-- `:peek()` or `:next()` will return `Eof`.
----------------------------------------------------------------------
function lexer :kill()
    self.i = #self.src+1
    self.peeked = { }
    self.attached_comments = { }
    self.lineinfo_last = self.posfact :get_position (#self.src+1)
end

return M
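----------------------------------------------------------------------
-- Illustrative sketch (demo only): keyword tests while parsing,
-- assuming [stream] was built as in the earlier examples.
--
--   local tok = stream :peek()
--   if stream :is_keyword (tok, 'if', 'while') then
--       stream :next()        -- consume the keyword
--   end
--   stream :check ('then')    -- consume `then`, or raise an error
----------------------------------------------------------------------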