Mae向きなブログ

Mae向きな情報発信を続けていきたいと思います。

正規表現の実現3

id:rahaema:20080702, id:rahaema:20080703 の続きです。
id:rahaema:20080702 では、「正規表現をパースして構文木を作る」。id:rahaema:20080703 では、「構文木からNFAを作成する」まででしたが、ようやく、「NFAからDFAを作成する」までたどり着くことができました。

前回同様、場当たり的で、見るに耐えないものになってしまいましたが、正規表現がどのように実現されているのかというのは自分なりに理解を深めることができたと思います。

regexp.rb

require 'pp'

class Scanner
  def initialize(reg)
    @reg = reg
    @list = []
  end
  def createTokenList
    @reg.split(//).each do |char|
      case char
      when "|"
        @list << Token.new(Token::Union)
      when "("
        @list << Token.new(Token::Lpar)
      when ")"
        @list << Token.new(Token::Rpar)
      when "*"
        @list << Token.new(Token::Star)
      when "+"
        @list << Token.new(Token::Plus)
      else
        @list << Token.new(Token::Char, char)
      end
    end
    @list << Token.new(Token::End)
    return @list
  end
end

class Parser
  # <regexp> ::= <term> | <term> '|' <regexp>
  # <term>   ::= EMPTY  | <factor> <term>
  # <factor> ::= <primary> | <primary>'*' | <primary>'+'
  # <primary>::= CHARACTER | '(' <regexp> ')'
  def initialize(tokenList)
    @tokenList = tokenList
    @current = nil
  end
  def parseRegexp
    nextToken
    t = regexp
    if @current.type != "End"
      raise SyntaxError
    end
    return t
  end
  def regexp
    x = term
    while @current.type == "Union"
      nextToken
      x = Node.new("op_union", x, term, nil)
    end
    return x
  end
  def term
    if @current.type == "Union" || 
        @current.type == "Rpar" || 
        @current.type == "End"
      x = Node.new("op_empty", nil, nil, nil)
    else
      x = factor
      while @current.type != "Union" && 
          @current.type != "Rpar" && 
          @current.type != "End"
        x = Node.new("op_concat", x, factor, nil)
      end
    end
    return x
  end
  def factor
    x = primary
    if @current.type == "Star"
      x = Node.new("op_closure", x, nil, nil)
      nextToken
    elsif @current.type == "Plus"
      x = Node.new("op_concat", x, 
                   Node.new("op_closure", x, nil, nil),
                   nil)
      nextToken
    end
    return x
  end
  def primary
    if @current.type == "Char"
      x = Node.new("op_char", nil, nil, @current.char)
      nextToken
    elsif @current.type == "Lpar"
      nextToken
      x = regexp
      if @current.type != "Rpar"
        raise SyntaxError
      end
      nextToken
    else
      raise SyntaxError
    end
    return x
  end
  def nextToken
    @current = @tokenList.shift
  end
  class SyntaxError < StandardError; end
end

class Node
  attr_reader :op, :left, :right, :char
  def initialize(op, left, right, char)
    @op = op
    @left = left
    @right = right
    @char = char
  end
end

class Tree2Nfa
  EMPTY = -1
  attr_reader :tree, :nfa_nstate, :nfa
  def initialize(tree)
    @tree = tree
    @nfa_nstate = 0
    @nfa = []
  end
  def build
    nfa_entry = genNode
    nfa_exit = genNode
    genNfa(@tree, nfa_entry, nfa_exit)
    return @nfa
  end
  def genNode
    retval = @nfa_nstate
    @nfa_nstate += 1
    return retval
  end
  def addTransition(from, to, c)
    p = NList.new(c, to)
    p.nextn = @nfa[from]
    @nfa[from] = p
  end
  def genNfa(tree, entry, wayOut)
    case tree.op
    when "op_char"
      addTransition(entry, wayOut, tree.char)
    when "op_empty"
      addTransition(entry, wayOut, EMPTY)
    when "op_union"
      genNfa(tree.left, entry, wayOut)
      genNfa(tree.right, entry, wayOut)
    when "op_closure"
      a1 = genNode
      a2 = genNode
      addTransition(entry, a1, EMPTY)
      genNfa(tree.left, a1, a2)
      addTransition(a2, a1, EMPTY)
      addTransition(a1, wayOut, EMPTY)
    when "op_concat"
      a1 = genNode
      genNfa(tree.left, entry, a1)
      genNfa(tree.right, a1, wayOut)
    else
      raise StandardError
    end
  end
end

class NList
  attr_reader :c, :to
  attr_accessor :nextn
  def initialize(c, to)
    @c = c
    @to = to
    @nextn = nil
  end
end

class DfaState
  attr_accessor :c, :nfaState, :to, :nextn, :finishd
  def initialize(c, nfaState, to)
    @c = c
    @nfaState = nfaState
    @to = to
    @nextn = nil
    @finishd = nil
  end
end

class Nfa2Dfa
  attr_reader :dfa
  def initialize(nfa)
    @nfa = nfa
    @dfa = []
    @dfaState = 0
    @mark = Hash.new(0)
    @nodeptr = Hash.new
  end
  def build
    charSet = collectChar([0])
    charSet.each do |char|
      @mark[char + collectState(0, char).join] += 1
      node = DfaState.new(char, collectState(0, char), @dfaState += 1)
      if @dfa[0] == nil
        @dfa[0] = node
      else
        node.nextn = @dfa[0]
        @dfa[0] = node
      end
      ptr = @dfa[0]
      while ptr != nil
        build2(ptr)
        ptr = ptr.nextn
      end
    end
  end
  def build2(dfa)
    charSet = collectChar(dfa.nfaState)
    charSet.each do |char|
      stateSet = []
      dfa.nfaState.each do |state|
        stateSet |= collectState(state, char)
      end
      if @mark[char + stateSet.join] == 0
        @mark[char + stateSet.join] += 1
        node = DfaState.new(char, stateSet, @dfaState += 1)
        @nodeptr[char + stateSet.join] = node
      elsif @mark[char + stateSet.join] == 1
        node = @nodeptr[char + stateSet.join].dup
        node.nextn = nil
        node.finishd = true
      end
      if @dfa[dfa.to] == nil
        @dfa[dfa.to] = node
      else
        node.nextn = @dfa[dfa.to]
        @dfa[dfa.to] = node
      end
    end
    ptr = @dfa[dfa.to] 
    while ptr != nil 
      build2(ptr) unless ptr.finishd
      ptr = ptr.nextn
    end
  end
  def collectState(state, char)
    newState = []
    ptr = @nfa[state]
    if ptr != nil
      while ptr.c != char && ptr.nextn != nil
        ptr = ptr.nextn
      end
      if ptr.c == char
        newState << ptr.to
        ptr = @nfa[ptr.to] 
        while ptr != nil && 
            ptr.c == Tree2Nfa::EMPTY
          newState << ptr.to
          ptr = @nfa[ptr.to]
        end
      end
    end
    newState.sort!
  end
  def collectChar(nfaState)
    charSet = []
    nfaState.each do |state|
      ptr = @nfa[state]
      while ptr != nil
        if ptr.c != Tree2Nfa::EMPTY
          charSet |= [ptr.c]
        end
        ptr = ptr.nextn
      end
    end
    charSet.sort!
    return charSet
  end
end

class Token
  Char = "Char"
  Union = "Union"
  Lpar = "Lpar"
  Rpar = "Rpar"
  Star = "Star"
  Plus = "Plus"
  End = "End"
  attr_reader :type, :char
  def initialize(type, char = nil)
    @type = type
    @char = char
  end
end

if __FILE__ == $0
  reg = "a(a|b)*a"
  scanner = Scanner.new(reg)
  tokenList = scanner.createTokenList
  parser = Parser.new(tokenList)
  tree = parser.parseRegexp
  nfa = Tree2Nfa.new(tree)
  nfaTable = nfa.build
  dfa = Nfa2Dfa.new(nfaTable)
  dfa.build
  pp dfa.dfa
end

"a(a|b)*a"をDFAにしたものが以下です。

実行結果

mmasa@debian:~/work/ruby/regexpression$ ruby regexp.rb
[#<DfaState:0xb7d1f710 ←(0)
  @c="a",
  @finishd=nil,
  @nextn=nil,
  @nfaState=[2, 3, 4],
  @to=1>,
 #<DfaState:0xb7d1f184 ←(1)
  @c="b",
  @finishd=nil,
  @nextn=
   #<DfaState:0xb7d1f3c8
    @c="a",
    @finishd=nil,
    @nextn=nil,
    @nfaState=[1, 2, 4, 5],
    @to=2>,
  @nfaState=[2, 4, 5],
  @to=3>,
 #<DfaState:0xb7d1e658 ←(2)
  @c="b",
  @finishd=true,
  @nextn=
   #<DfaState:0xb7d1e89c
    @c="a",
    @finishd=true,
    @nextn=nil,
    @nfaState=[1, 2, 4, 5],
    @to=2>,
  @nfaState=[2, 4, 5],
  @to=3>,
 #<DfaState:0xb7d1ebe4 ←(3)
  @c="b",
  @finishd=true,
  @nextn=
   #<DfaState:0xb7d1edec
    @c="a",
    @finishd=true,
    @nextn=nil,
    @nfaState=[1, 2, 4, 5],
    @to=2>,
  @nfaState=[2, 4, 5],
  @to=3>]

ちょっと見にくいので、実行結果に状態を(0)〜(3)で書き込んでいます。
例えば、"abaa"という文字列の場合は、

(0)→(2)→(3)→(2)

のように、遷移していきます。