id:rahaema:20080702, id:rahaema:20080703 の続きです。
id:rahaema:20080702 では、「正規表現をパースして構文木を作る」。id:rahaema:20080703 では、「構文木からNFAを作成する」まででしたが、ようやく、「NFAからDFAを作成する」までたどり着くことができました。
前回同様、場当たり的で、見るに耐えないものになってしまいましたが、正規表現がどのように実現されているのかというのは自分なりに理解を深めることができたと思います。
regexp.rb
require 'pp' class Scanner def initialize(reg) @reg = reg @list = [] end def createTokenList @reg.split(//).each do |char| case char when "|" @list << Token.new(Token::Union) when "(" @list << Token.new(Token::Lpar) when ")" @list << Token.new(Token::Rpar) when "*" @list << Token.new(Token::Star) when "+" @list << Token.new(Token::Plus) else @list << Token.new(Token::Char, char) end end @list << Token.new(Token::End) return @list end end class Parser # <regexp> ::= <term> | <term> '|' <regexp> # <term> ::= EMPTY | <factor> <term> # <factor> ::= <primary> | <primary>'*' | <primary>'+' # <primary>::= CHARACTER | '(' <regexp> ')' def initialize(tokenList) @tokenList = tokenList @current = nil end def parseRegexp nextToken t = regexp if @current.type != "End" raise SyntaxError end return t end def regexp x = term while @current.type == "Union" nextToken x = Node.new("op_union", x, term, nil) end return x end def term if @current.type == "Union" || @current.type == "Rpar" || @current.type == "End" x = Node.new("op_empty", nil, nil, nil) else x = factor while @current.type != "Union" && @current.type != "Rpar" && @current.type != "End" x = Node.new("op_concat", x, factor, nil) end end return x end def factor x = primary if @current.type == "Star" x = Node.new("op_closure", x, nil, nil) nextToken elsif @current.type == "Plus" x = Node.new("op_concat", x, Node.new("op_closure", x, nil, nil), nil) nextToken end return x end def primary if @current.type == "Char" x = Node.new("op_char", nil, nil, @current.char) nextToken elsif @current.type == "Lpar" nextToken x = regexp if @current.type != "Rpar" raise SyntaxError end nextToken else raise SyntaxError end return x end def nextToken @current = @tokenList.shift end class SyntaxError < StandardError; end end class Node attr_reader :op, :left, :right, :char def initialize(op, left, right, char) @op = op @left = left @right = right @char = char end end class Tree2Nfa EMPTY = -1 attr_reader :tree, :nfa_nstate, :nfa def initialize(tree) @tree = tree @nfa_nstate = 0 @nfa = [] end def build nfa_entry = genNode nfa_exit = genNode genNfa(@tree, nfa_entry, nfa_exit) return @nfa end def genNode retval = @nfa_nstate @nfa_nstate += 1 return retval end def addTransition(from, to, c) p = NList.new(c, to) p.nextn = @nfa[from] @nfa[from] = p end def genNfa(tree, entry, wayOut) case tree.op when "op_char" addTransition(entry, wayOut, tree.char) when "op_empty" addTransition(entry, wayOut, EMPTY) when "op_union" genNfa(tree.left, entry, wayOut) genNfa(tree.right, entry, wayOut) when "op_closure" a1 = genNode a2 = genNode addTransition(entry, a1, EMPTY) genNfa(tree.left, a1, a2) addTransition(a2, a1, EMPTY) addTransition(a1, wayOut, EMPTY) when "op_concat" a1 = genNode genNfa(tree.left, entry, a1) genNfa(tree.right, a1, wayOut) else raise StandardError end end end class NList attr_reader :c, :to attr_accessor :nextn def initialize(c, to) @c = c @to = to @nextn = nil end end class DfaState attr_accessor :c, :nfaState, :to, :nextn, :finishd def initialize(c, nfaState, to) @c = c @nfaState = nfaState @to = to @nextn = nil @finishd = nil end end class Nfa2Dfa attr_reader :dfa def initialize(nfa) @nfa = nfa @dfa = [] @dfaState = 0 @mark = Hash.new(0) @nodeptr = Hash.new end def build charSet = collectChar([0]) charSet.each do |char| @mark[char + collectState(0, char).join] += 1 node = DfaState.new(char, collectState(0, char), @dfaState += 1) if @dfa[0] == nil @dfa[0] = node else node.nextn = @dfa[0] @dfa[0] = node end ptr = @dfa[0] while ptr != nil build2(ptr) ptr = ptr.nextn end end end def build2(dfa) charSet = collectChar(dfa.nfaState) charSet.each do |char| stateSet = [] dfa.nfaState.each do |state| stateSet |= collectState(state, char) end if @mark[char + stateSet.join] == 0 @mark[char + stateSet.join] += 1 node = DfaState.new(char, stateSet, @dfaState += 1) @nodeptr[char + stateSet.join] = node elsif @mark[char + stateSet.join] == 1 node = @nodeptr[char + stateSet.join].dup node.nextn = nil node.finishd = true end if @dfa[dfa.to] == nil @dfa[dfa.to] = node else node.nextn = @dfa[dfa.to] @dfa[dfa.to] = node end end ptr = @dfa[dfa.to] while ptr != nil build2(ptr) unless ptr.finishd ptr = ptr.nextn end end def collectState(state, char) newState = [] ptr = @nfa[state] if ptr != nil while ptr.c != char && ptr.nextn != nil ptr = ptr.nextn end if ptr.c == char newState << ptr.to ptr = @nfa[ptr.to] while ptr != nil && ptr.c == Tree2Nfa::EMPTY newState << ptr.to ptr = @nfa[ptr.to] end end end newState.sort! end def collectChar(nfaState) charSet = [] nfaState.each do |state| ptr = @nfa[state] while ptr != nil if ptr.c != Tree2Nfa::EMPTY charSet |= [ptr.c] end ptr = ptr.nextn end end charSet.sort! return charSet end end class Token Char = "Char" Union = "Union" Lpar = "Lpar" Rpar = "Rpar" Star = "Star" Plus = "Plus" End = "End" attr_reader :type, :char def initialize(type, char = nil) @type = type @char = char end end if __FILE__ == $0 reg = "a(a|b)*a" scanner = Scanner.new(reg) tokenList = scanner.createTokenList parser = Parser.new(tokenList) tree = parser.parseRegexp nfa = Tree2Nfa.new(tree) nfaTable = nfa.build dfa = Nfa2Dfa.new(nfaTable) dfa.build pp dfa.dfa end
"a(a|b)*a"をDFAにしたものが以下です。
実行結果
mmasa@debian:~/work/ruby/regexpression$ ruby regexp.rb [#<DfaState:0xb7d1f710 ←(0) @c="a", @finishd=nil, @nextn=nil, @nfaState=[2, 3, 4], @to=1>, #<DfaState:0xb7d1f184 ←(1) @c="b", @finishd=nil, @nextn= #<DfaState:0xb7d1f3c8 @c="a", @finishd=nil, @nextn=nil, @nfaState=[1, 2, 4, 5], @to=2>, @nfaState=[2, 4, 5], @to=3>, #<DfaState:0xb7d1e658 ←(2) @c="b", @finishd=true, @nextn= #<DfaState:0xb7d1e89c @c="a", @finishd=true, @nextn=nil, @nfaState=[1, 2, 4, 5], @to=2>, @nfaState=[2, 4, 5], @to=3>, #<DfaState:0xb7d1ebe4 ←(3) @c="b", @finishd=true, @nextn= #<DfaState:0xb7d1edec @c="a", @finishd=true, @nextn=nil, @nfaState=[1, 2, 4, 5], @to=2>, @nfaState=[2, 4, 5], @to=3>]
ちょっと見にくいので、実行結果に状態を(0)〜(3)で書き込んでいます。
例えば、"abaa"という文字列の場合は、
(0)→(2)→(3)→(2)
のように、遷移していきます。