Mae向きなブログ

Mae向きな情報発信を続けていきたいと思います。

正規表現の実現2

昨日(id:rahaema:20080702)の続きです。
昨日は、「正規表現をパースして構文木を作る」まででしたが、今日は、「構文木からNFAを作成する」までを取り組んでみました。

以下がRubyで書いたソースですが、場当たり的にクラスを作ったりしていて、ちょっと…です。

regexp.rb

require 'pp'

class Scanner
  def initialize(reg)
    @reg = reg
    @list = []
  end
  def createTokenList
    @reg.split(//).each do |char|
      case char
      when "|"
        @list << Token.new(Token::Union)
      when "("
        @list << Token.new(Token::Lpar)
      when ")"
        @list << Token.new(Token::Rpar)
      when "*"
        @list << Token.new(Token::Star)
      when "+"
        @list << Token.new(Token::Plus)
      else
        @list << Token.new(Token::Char, char)
      end
    end
    @list << Token.new(Token::End)
    return @list
  end
end

class Parser
  # <regexp> ::= <term> | <term> '|' <regexp>
  # <term>   ::= EMPTY  | <factor> <term>
  # <factor> ::= <primary> | <primary>'*' | <primary>'+'
  # <primary>::= CHARACTER | '(' <regexp> ')'
  def initialize(tokenList)
    @tokenList = tokenList
    @current = nil
  end
  def parseRegexp
    nextToken
    t = regexp
    if @current.type != "End"
      raise SyntaxError
    end
    return t
  end
  def regexp
    x = term
    while @current.type == "Union"
      nextToken
      x = Node.new("op_union", x, term, nil)
    end
    return x
  end
  def term
    if @current.type == "Union" || 
        @current.type == "Rpar" || 
        @current.type == "End"
      x = Node.new("op_empty", nil, nil, nil)
    else
      x = factor
      while @current.type != "Union" && 
          @current.type != "Rpar" && 
          @current.type != "End"
        x = Node.new("op_concat", x, factor, nil)
      end
    end
    return x
  end
  def factor
    x = primary
    if @current.type == "Star"
      x = Node.new("op_closure", x, nil, nil)
      nextToken
    elsif @current.type == "Plus"
      x = Node.new("op_concat", x, 
                   Node.new("op_closure", x, nil, nil),
                   nil)
      nextToken
    end
    return x
  end
  def primary
    if @current.type == "Char"
      x = Node.new("op_char", nil, nil, @current.char)
      nextToken
    elsif @current.type == "Lpar"
      nextToken
      x = regexp
      if @current.type != "Rpar"
        raise SyntaxError
      end
      nextToken
    else
      raise SyntaxError
    end
    return x
  end
  def nextToken
    @current = @tokenList.shift
  end
  class SyntaxError < StandardError; end
end

class Node
  attr_reader :op, :left, :right, :char
  def initialize(op, left, right, char)
    @op = op
    @left = left
    @right = right
    @char = char
  end
end

class Tree2Nfa
  EMPTY = -1
  attr_reader :tree, :nfa_nstate, :nfa
  def initialize(tree)
    @tree = tree
    @nfa_nstate = 0
    @nfa = []
  end
  def build
    nfa_entry = genNode
    nfa_exit = genNode
    genNfa(@tree, nfa_entry, nfa_exit)
    return @nfa
  end
  def genNode
    retval = @nfa_nstate
    @nfa_nstate += 1
    return retval
  end
  def addTransition(from, to, c)
    p = NList.new(c, to)
    p.nextn = @nfa[from]
    @nfa[from] = p
  end
  def genNfa(tree, entry, wayOut)
    case tree.op
    when "op_char"
      addTransition(entry, wayOut, tree.char)
    when "op_empty"
      addTransition(entry, wayOut, EMPTY)
    when "op_union"
      genNfa(tree.left, entry, wayOut)
      genNfa(tree.right, entry, wayOut)
    when "op_closure"
      a1 = genNode
      a2 = genNode
      addTransition(entry, a1, EMPTY)
      genNfa(tree.left, a1, a2)
      addTransition(a2, a1, EMPTY)
      addTransition(a1, wayOut, EMPTY)
    when "op_concat"
      a1 = genNode
      genNfa(tree.left, entry, a1)
      genNfa(tree.right, a1, wayOut)
    else
      raise StandardError
    end
  end
end

class NList
  attr_reader :c, :to
  attr_accessor :nextn
  def initialize(c, to)
    @c = c
    @to = to
    @nextn = nil
  end
end

class Token
  Char = "Char"
  Union = "Union"
  Lpar = "Lpar"
  Rpar = "Rpar"
  Star = "Star"
  Plus = "Plus"
  End = "End"
  attr_reader :type, :char
  def initialize(type, char = nil)
    @type = type
    @char = char
  end
end

if __FILE__ == $0
  reg = "a(a|b)*a"
  scanner = Scanner.new(reg)
  tokenList = scanner.createTokenList
  parser = Parser.new(tokenList)
  tree = parser.parseRegexp
  nfa = Tree2Nfa.new(tree)
  pp nfa.build
end

"a(a|b)*a"をNFAにしたものが以下です。

実行結果

mmasa@debian:~/work/ruby/regexpression$ ruby regexp.rb
[#<NList:0xb7c8343c @c="a", @nextn=nil, @to=3>, ←(0)
 nil, ←(1)
 #<NList:0xb7c83310 @c="a", @nextn=nil, @to=1>, ←(2)
 #<NList:0xb7c833d8 @c=-1, @nextn=nil, @to=4>,  ←(3)
 #<NList:0xb7c83338                             ←(4)
  @c=-1,
  @nextn=
   #<NList:0xb7c83324
    @c="b",
    @nextn=#<NList:0xb7c8334c @c="a", @nextn=nil, @to=5>,
    @to=5>,
  @to=2>,
 #<NList:0xb7c83388 @c=-1, @nextn=nil, @to=4>]  ←(5)

ちょっと見にくいので、実行結果に状態を(0)〜(5)で書き込んでいます。
例えば、"abaa"という文字列の場合、

(0)→(3)→(4)→(5)→(4)→(5)→(4)→(2)

のように、遷移していきます。