# christian neukirchen, 2007 # mit-licensed class RPeg begin require 'rpeg.parser.rb' rescue LoadError end class ParseError < RuntimeError end def initialize(input="") @input = input @cursor = 0 @stack = [] @matches = {} @value = :undef @definition_order = [] @nonterminals = {} end def self.load(grammar) raise "RPeg not bootstrapped" unless defined? RPeg::PEGParser parser = PEGParser.new parser.parse(grammar) end attr_accessor :input attr_reader :matches attr_reader :cursor attr_reader :value def reset(input=@input) @input = input @cursor = 0 @stack = [] end # consider inlining these def getc ch = @input[@cursor] @cursor += 1 ch end def push @stack << @cursor end def restore @cursor = @stack.pop :fail end def commit @stack.pop :succeed end def call(args) case args when Array __send__(*args) when Proc args.call(self) :succeed end end # ----------------- def define(name, term) @definition_order << name.to_s @nonterminals[name.to_s] = term end def parse(string=@input, name="start") reset string method = :"nt_#{name}" if respond_to? method r = __send__(method) else r = nonterminal(name) end case r when :succeed @value when :fail raise RPeg::ParseError, "parse error, buffer: #{@input[0...@cursor]}" end end def literal(string) push string.each_byte { |b| if getc != b return restore end } commit end def eof push if getc != nil restore else commit end end def dot push if (c=getc) == nil restore else commit end end def sequence(*items) push items.each { |item| if call(item) == :fail return restore end } commit end def choice(*items) push items.each { |item| case call(item) when :succeed return commit end } restore end def epsilon :succeed end def once(term) choice term, [:epsilon] end def zero_or_more(term) loop { if call(term) == :fail return :succeed end } end def one_or_more(term) sequence term, [:zero_or_more, term] end def charclass(chars) push ch = getc if ch && ch.chr =~ Regexp.compile(/\A[#{chars}]\z/m) commit else restore end end def nonterminal(name) begin term = @nonterminals.fetch(name.to_s) rescue IndexError raise "undefined non-terminal: #{name}" else m, @matches = @matches, {} r = call(term) if @matches.include? :value @value = @matches[:value] end @matches = m r end end def assert(term) push r = call(term) restore r end def assert_not(term) push r = call(term) restore case r when :fail :succeed when :succeed :fail end end def capture(name, term) name = name.to_sym c, v = @cursor, @value @value = :undef if call(term) == :succeed if @value != :undef @matches[name] = @value else @matches[name] = @input[c...@cursor] end @value = v :succeed else @value = v :fail end end def code(source) instance_eval source :success end def pretty_print @definition_order.map { |name| "#{name} <- #{pretty_term(@nonterminals[name])};\n" }.join end def pretty_term(term) return "" if term.kind_of? Proc return "????" unless term.kind_of? Array case term.first when :nonterminal term[1].to_s when :sequence term[1..-1].map { |t| pretty_term t }.join(" ") when :choice term[1..-1].map { |t| pretty_term t }.join(" / ") when :literal term[1].dump when :charclass "[" + term[1].dump[1...-1] + "]" when :eof "%eof" when :epsilon "%epsilon" when :zero_or_more pretty_paren(term[1]) + "*" when :one_or_more pretty_paren(term[1]) + "+" when :once pretty_paren(term[1]) + "?" when :assert "&" + pretty_paren(term[1]) when :assert_not "!" + pretty_paren(term[1]) when :dot ". " when :capture term[1].to_s + ":" + pretty_paren(term[2]) when :code "{{#{term[1]}}}" else raise "can't pretty print #{term.first}" end end def pretty_paren(term) return "????" unless term.kind_of? Array case term.first when :sequence, :choice, :zero_or_more, :one_or_more, :once, :assert, :assert_not, :capture "(" + pretty_term(term) + ")" when :nonterminal, :literal, :charclass, :eof, :epsilon, :dot, :code pretty_term(term) else raise "don't know whether to parenthesize #{term.first}" end end def genid(prefix="id") @genid ||= 0 @genid += 1 "%s%04d" % [prefix, @genid] end def compile2ruby(klass="Parser") s = [] s << "class #{klass}" @definition_order.map { |name| r = genid "r" m = genid "m" s << " def nt_#{name}" # s << " puts '#{name}'" s << " @matches, #{m} = {}, @matches" s << " #{r} = " + compile2ruby_term(@nonterminals[name]) s << " if @matches.include? :value" s << " @value = @matches[:value]" s << " end" s << " @matches = #{m}" s << " #{r}" s << " end" } s << "end\n" s.join("\n") end def compile2ruby_term(term) return "" if term.kind_of? Proc return "????" unless term.kind_of? Array case term.first when :nonterminal "nt_" + term[1].to_s when :sequence return compile2ruby_term(term[1]) if term.size == 2 r = genid "r" cur = genid "cur" s = [] s << "begin" s << " #{cur} = @cursor" s << " #{r} = :succeed" term[1..-1].each { |item| s << "if #{r} != :fail" s << " #{r} = " + compile2ruby_term(item) s << "else" s << " @cursor = #{cur}" s << "end" } s << " #{r}" s << "end" s.join("\n") when :choice return compile2ruby_term(term[1]) if term.size == 2 r = genid "r" cur = genid "cur" s = [] s << "begin" s << " #{cur} = @cursor" s << " #{r} = :fail" term[1..-1].each { |item| s << " #{r} = " + compile2ruby_term(item) s << " if #{r} == :fail" s << " @cursor = #{cur}" } term[1..-1].each { |item| s << "end" } s << " #{r}" s << "end" s.join("\n") when :literal r = genid "r" s = [] s << "begin" s << " #{r} = :succeed" term[1].each_byte { |b| s << "if #{r} != :fail && @input[@cursor] != #{b}" s << " #{r} = :fail" s << "else" s << " @cursor += 1" s << "end" } s << " #{r}" s << "end" s.join("\n") when :charclass ch = genid "ch" s = [] s << "begin" s << " #{ch} = @input[@cursor]" s << " if #{ch} && #{ch}.chr =~ /\\A[#{term[1].dump[1...-1]}]\\z/m" s << " @cursor += 1" s << " :succeed" s << " else" s << " :fail" s << " end" s << "end" s.join("\n") when :eof "(@input[@cursor]==nil ? :succeed : :fail)" when :epsilon "(:succeed)" when :zero_or_more r = genid "r" s = [] s << "loop {" s << " #{r} = " + compile2ruby_term(term[1]) s << " break :succeed if #{r} == :fail" s << "}" s.join("\n") when :one_or_more compile2ruby_term([:sequence, term[1], [:zero_or_more, term[1]]]) when :once compile2ruby_term([:choice, term[1], [:epsilon]]) when :assert r = genid "r" cur = genid "cur" s = [] s << "begin" s << " #{cur} = @cursor" s << " #{r} = " + compile2ruby_term(term[1]) s << " @cursor = #{cur}" s << " #{r}" s << "end" s.join("\n") when :assert_not r = genid "r" cur = genid "cur" s = [] s << "begin" s << " #{cur} = @cursor" s << " #{r} = " + compile2ruby_term(term[1]) s << " @cursor = #{cur}" s << " #{r} == :fail ? :succeed : :fail" s << "end" s.join("\n") when :dot s = [] s << "if @input[@cursor] == nil" s << " :fail" s << "else" s << " @cursor += 1" s << " :succeed" s << "end" s.join("\n") when :capture name = term[1].to_s r = genid "r" c = genid "c" v = genid "v" s = [] s << "begin" s << " #{c}, #{v} = @cursor, @value" s << " @value = :undef" s << " #{r} = " + compile2ruby_term(term[2]) s << " if #{r} == :succeed" s << " if @value != :undef" s << " @matches[:#{name}] = @value" s << " else" s << " @matches[:#{name}] = @input[#{c}...@cursor]" s << " end" s << " @value = #{v}" s << " :succeed" s << " else" s << " @value = #{v}" s << " :fail" s << " end" s << "end" s.join("\n") when :code "(#{term[1]}; :succeed)" else raise "can't compile #{term.first}" end end def compile2c(modname="parser") first_rule = @definition_order.first s = [] s << <cursor) #define SIZE (RSTRING(arg->string)->len) #define AT(x) (RSTRING(arg->string)->ptr[x]) struct arg { VALUE string; VALUE object; VALUE matches; VALUE value; long cursor; }; BOILERPLATE @definition_order.map { |name| s << "rule(nt_#{name});" } s << <matches;" s << " VALUE v = arg->value;" s << " int r;" s << " arg->matches = rb_hash_new();" s << " r = #{func}(arg);" s << " if (st_lookup(RHASH(arg->matches)->tbl, ID2SYM(rb_intern(\"value\")), 0))" s << " arg->value = rb_hash_aref(arg->matches, ID2SYM(rb_intern(\"value\")));" s << " arg->matches = m;" s << " return r;" s << "}" } s << <string)->ptr[arg->cursor] != #{byte}) { CURSOR = c; fail; }" s << " CURSOR++;" } s << " succeed;" s << "}" [f, s.join("\n")] when :charclass f = genid "f" s = [] s << "rule(#{f}) {" s << " int ch;" s << " if (CURSOR >= SIZE) fail;" s << " ch = RSTRING(arg->string)->ptr[arg->cursor];" s << " if (" term[1].scan(/((.)-(.)|(.))/m) { if $2 s << "(ch >= #{$2[0]} && ch <= #{$3[0]}) ||" else s << "(ch == #{$4[0]}) ||" end } s << " 0)" s << " { CURSOR++; succeed; } else fail;" s << "}" [f, s.join("\n")] when :eof f = genid "f" s = [] s << "rule(#{f}) {" s << " if (CURSOR < SIZE) fail; else succeed;" s << "}" [f, s.join("\n")] when :epsilon f = genid "f" s = [] s << "rule(#{f}) {" s << " succeed;" s << "}" [f, s.join("\n")] when :zero_or_more f = genid "f" func, code = compile2c_term(term[1]) s = [] s << code s << "rule(#{f}) {" s << " while(#{func}(arg));" s << " succeed;" s << "}" [f, s.join("\n")] when :one_or_more f = genid "f" func, code = compile2c_term(term[1]) s = [] s << code s << "rule(#{f}) {" s << " if (!#{func}(arg)) fail;" s << " while(#{func}(arg));" s << " succeed;" s << "}" [f, s.join("\n")] when :once f = genid "f" func, code = compile2c_term(term[1]) s = [] s << code s << "rule(#{f}) {" s << " #{func}(arg);" s << " succeed;" s << "}" [f, s.join("\n")] when :assert f = genid "f" func, code = compile2c_term(term[1]) s = [] s << code s << "rule(#{f}) {" s << " int c = CURSOR;" s << " if (!#{func}(arg)) fail;" s << " CURSOR = c;" s << " succeed;"; s << "}" [f, s.join("\n")] when :assert_not f = genid "f" func, code = compile2c_term(term[1]) s = [] s << code s << "rule(#{f}) {" s << " int c = CURSOR;" s << " if (!#{func}(arg)) succeed;" s << " CURSOR = c;" s << " fail;"; s << "}" [f, s.join("\n")] when :dot f = genid "f" s = [] s << "rule(#{f}) {" s << " if (CURSOR < SIZE) { CURSOR++; succeed; } else fail;" s << "}" [f, s.join("\n")] when :capture f = genid "f" func, code = compile2c_term(term[2]) name = term[1].to_s s = [] s << code s << "rule(#{f}) {" s << " int c = CURSOR;" s << " VALUE v = arg->value;" s << " arg->value = ID2SYM(rb_intern(\"undef\"));" s << " if (#{func}(arg)) {" s << " if (arg->value != ID2SYM(rb_intern(\"undef\")))" s << " rb_hash_aset(arg->matches, ID2SYM(rb_intern(#{name.dump})), arg->value);" s << " else" s << " rb_hash_aset(arg->matches, ID2SYM(rb_intern(#{name.dump})), rb_str_new(RSTRING(arg->string)->ptr+c, CURSOR-c));" s << " arg->value = v;" s << " succeed;" s << " } else {" s << " arg->value = v;" s << " fail;" s << " };" s << "}" [f, s.join("\n")] when :code f = genid "f" s = [] s << "rule(#{f}) {" s << " rb_iv_set(arg->object, \"@matches\", arg->matches);" s << " rb_iv_set(arg->object, \"@value\", arg->value);" s << " rb_funcall(arg->object, rb_intern(\"instance_eval\"), 3, rb_str_new2(#{term[1].dump}), rb_str_new2(__FILE__), INT2FIX(__LINE__));" s << " arg->matches = rb_iv_get(arg->object, \"@matches\");" s << " arg->value = rb_iv_get(arg->object, \"@value\");" s << " succeed;" s << "}" [f, s.join("\n")] else raise "can't compile #{term.first}" end end def unescape(s) self.class.unescape(s) end def self.unescape(s) s.gsub(/\\(.)/) { $1.tr(%Q{nrtf'"\\}, "\n\r\t\f'\"\\") } end end