id:rahaema:20080623に引き続き、「転置インデックスによる検索システムを作ってみよう!」を参考に、学習。
学習のポイントは、
- QDBMの練習
- Marshalの練習
です。
index.rb
# index.rb -- build a character-bigram inverted index from standard input.
#
# Each input line is expected to look like "<digits> <text>"; lines that do
# not match that shape are skipped and not counted. For every matching line
# the word characters (\w) of the text are paired into adjacent bigrams and
# the line's id is appended to the posting list of each distinct bigram.
#
# Output is written to "test.qdbm": one record per bigram holding the
# Marshal-dumped array of ids, plus a "NUM" record holding the number of
# indexed lines as a decimal string.
require 'depot'

num = 0
idx = Hash.new { |h, key| h[key] = [] }

while line = gets
  line.chomp!
  next unless %r|\A(\d+) (.+)\z| =~ line
  id, text = $1, $2

  # Bigrams already recorded for this line, so an id is listed at most once
  # per bigram.
  seen = {}
  text.scan(/\w/).each_cons(2) do |pair|
    # Use join rather than Array#to_s: since Ruby 1.9 Array#to_s returns the
    # inspect form (e.g. '["a", "b"]') instead of the concatenation "ab".
    # Whatever builds lookup keys for this index must derive them the same way.
    bigram = pair.join
    next if seen[bigram]
    idx[bigram] << id
    seen[bigram] = true
  end
  num += 1
end

Depot::new("test.qdbm", Depot::OWRITER | Depot::OCREAT) do |depot|
  # Records are stored in ascending key order.
  idx.sort_by { |key, _ids| key }.each do |bigram, ids|
    depot[bigram] = Marshal.dump(ids)
  end
  depot["NUM"] = num.to_s
end
search.rb
# search.rb -- rank indexed documents against a query using TF-IDF over
# character bigrams.
#
# Usage: ruby search.rb QUERY
#
# Reads "test.qdbm", where each bigram key maps to a Marshal-dumped array of
# document ids and the "NUM" record holds the document count. Prints the query
# followed by one "ID:<id> SCORE:<score>" line per matching document, highest
# score first.
require 'depot'

line = ARGV[0]
# Without a query there is nothing to scan; fail with a usage message instead
# of a NoMethodError on nil.
abort "usage: ruby search.rb QUERY" if line.nil?

score = Hash.new(0)

# Term frequency of every adjacent pair of word characters in the query.
# Keys are built with join (not Array#to_s, which returns the inspect form
# since Ruby 1.9) so that each key is the two characters concatenated; the
# keys stored in the database must have been built the same way.
tf = Hash.new(0)
line.scan(/\w/).each_cons(2) { |pair| tf[pair.join] += 1 }

Depot::new("test.qdbm") do |depot|
  num = depot["NUM"].to_f
  tf.each do |bigram, count|
    # Fetch and decode the posting list once per bigram. A bigram with no
    # record contributes an empty list (document frequency 0).
    # NOTE(review): Marshal.load is only safe on a database produced by a
    # trusted indexer -- do not point this at untrusted files.
    record = depot[bigram]
    ids = record ? Marshal.load(record) : []

    # The +1 keeps the divisor non-zero when the bigram is not in the index.
    idf = Math.log(num / (ids.size + 1))
    tfidf = count * idf
    ids.each { |id| score[id] += tfidf }
  end
end

puts line
score.sort { |a, b| b[1] <=> a[1] }.each do |id, value|
  puts "ID:#{id} SCORE:#{value}"
end