To edit pages or tickets please login with username/password: aaf/aaf

Changeset 41

Show
Ignore:
Timestamp:
05/04/06 22:42:52 (3 years ago)
Author:
jk
Message:

added more_like_this method to query an index for similar items

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/plugin/acts_as_ferret/lib/acts_as_ferret.rb

    r38 r41  
    2020 
    2121require 'active_record' 
     22 
     23# Ferret 0.3.2 is considered the most reliable ferret version for now, all unit 
     24# tests should pass (with or w/o the C extension). Speed is not as good as with the 
     25# C-only Ferret 0.9.1, but still fast enough for common scenarios and work 
     26# loads. Until Ferret 0.9.x stabilizes, you should consider this 
     27# version for production scenarios. 
     28#require_gem 'ferret', '=0.3.2' 
     29 
     30# Ferret >=0.9, Ruby-only, is much slower than 0.3.2 with its small C 
     31# extension, so it's not really an option. 
     32# some tests related to searching multiple indexes at once are failing here 
     33# (returning more results than expected)  
     34#require 'rferret' 
     35 
     36# This will use the most recent installed ferret version, usually this will be 
     37# 0.9.1 in the C-flavour. Difficult topic, as some parts of the API are not  
     38# accessible yet. Several tests fail with this version, but basic single-index 
     39# functionality is there and working. 
    2240require 'ferret' 
    2341 
     
    486504          return doc 
    487505        end 
    488          
     506 
     507        # BIG TODO: this file really gets too big. need to refactor a bit... 
     508        # maybe extract the more like this stuff, could be useful somewhere 
     509        # else, too... 
     510 
     511 
     512        # returns other instances of this class, which have similar contents 
     513        # like this one. Basically works like this: find out n most interesting 
     514        # (i.e. characteristic) terms from this document, and then build a 
     515        # query from those which is run against the whole index. Which terms 
     516        # are interesting is decided on various criteria which can be 
     517        # influenced by the given options.  
     518        # 
     519        # The algorithm used here is a quite straight port of the MoreLikeThis class 
     520        # from Apache Lucene. 
     521        # 
     522        # options are: 
     523        # :field_names : Array of field names to use for similarity search (mandatory) 
     524        # :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc. 
     525        # :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs 
     526        # :min_word_length => nil, # Ignore words if less than this len (longer 
     527        # words tend to be more characteristic for the document they occur in). 
     528        # :max_word_length => nil, # Ignore words if greater than this len. 
     529        # :max_query_terms => 25,  # maximum number of terms in the query built 
     530        # :max_num_tokens => 5000, # maximum number of tokens to examine in a 
     531        # single field 
     532        # :boost => false,         # when true, a boost according to the 
     533        # relative score of a term is applied to this Term's TermQuery. 
     534        # :similarity => Ferret::Search::Similarity.default, # the similarity 
     535        # implementation to use 
     536        # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to 
     537        # use 
     538        def more_like_this(options={}) 
     539          options = { 
     540            :field_names => nil,  # Default field names 
     541            :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc. 
     542            :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs 
     543            :min_word_length => nil, # Ignore words if less than this len. 
     544            :max_word_length => nil, # Ignore words if greater than this len. 
     545            :max_query_terms => 25,  # maximum number of terms in the query built 
     546            :max_num_tokens => 5000, 
     547            :boost => false,       
     548            :similarity => Ferret::Search::Similarity.default, 
     549            :analyzer => Ferret::Analysis::StandardAnalyzer.new 
     550          }.update(options) 
     551          index = self.class.ferret_index 
     552          begin 
     553            reader = index.send(:reader) 
     554          rescue 
     555            # ferret >=0.9, C-Version doesn't allow access to Index#reader 
     556            reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false)) 
     557          end 
     558          doc_number = self.document_number 
     559          term_freq_map = retrieve_terms(document_number, reader, options) 
     560          priority_queue = create_queue(term_freq_map, reader, options) 
     561          query = create_query(priority_queue, options) 
     562          self.class.find_by_contents(query) 
     563        end 
     564 
     565         
     566        def create_query(priority_queue, options={}) 
     567          query = Ferret::Search::BooleanQuery.new 
     568          qterms = 0 
     569          best_score = 0 
     570          while(cur = priority_queue.pop) 
     571            term_query = Ferret::Search::TermQuery.new(cur.to_term) 
     572             
     573            if options[:boost] 
     574              # boost term according to relative score 
     575              # TODO untested 
     576              best_score ||= cur.score 
     577              term_query.boost = cur.score / best_score 
     578            end 
     579            begin 
     580              query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD)  
     581            rescue Ferret::Search::BooleanQuery::TooManyClauses 
     582              break 
     583            end 
     584            qterms += 1 
     585            break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms] 
     586          end 
     587          # exclude ourselves 
     588          t = Ferret::Index::Term.new('id', self.id.to_s) 
     589          query.add_query(Ferret::Search::TermQuery.new(t), 
     590                          Ferret::Search::BooleanClause::Occur::MUST_NOT) 
     591          return query 
     592        end 
     593 
     594         
     595        def document_number 
     596          hits = self.class.ferret_index.search("id:#{self.id}") 
     597          hits.each { |hit, score| return hit } 
     598        end 
     599 
     600        # creates a term/term_frequency map for terms from the fields 
     601        # given in options[:field_names] 
     602        def retrieve_terms(doc_number, reader, options) 
     603          field_names = options[:field_names] 
     604          max_num_tokens = options[:max_num_tokens] 
     605          term_freq_map = Hash.new(0) 
     606          field_names.each do |field| 
     607            term_freq_vector = reader.get_term_vector(document_number, field) 
     608            if term_freq_vector 
     609              # use stored term vector 
     610              # TODO untested 
     611              term_freq_vector.terms.each_with_index do |term, i| 
     612                term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term) 
     613              end 
     614            else 
     615              # no term vector stored, extract terms from document content 
     616              # TODO: if no content stored, maybe use content from self ? 
     617              doc = reader.get_document(doc_number) 
     618              token_count = 0 
     619               
     620              # C-Ferret >=0.9 again, no #each in tokenstream :-( 
     621              ts = options[:analyzer].token_stream(field, doc[field]) 
     622              while token = ts.next 
     623              #options[:analyzer].token_stream(field, doc[field]).each do |token| 
     624                break if (token_count+=1) > max_num_tokens 
     625                 
     626                next if noise_word?(token_text(token)) 
     627                term_freq_map[token_text(token)] += 1 
     628              end 
     629            end 
     630          end 
     631          term_freq_map 
     632        end 
     633 
     634        # extract textual value of a token 
     635        def token_text(token) 
     636          # token.term_text is for ferret 0.3.2 
     637          token.respond_to?(:text) ? token.text : token.term_text 
     638        end 
     639 
     640        # create an ordered(by score) list of word,fieldname,score  
     641        # structures 
     642        def create_queue(term_freq_map, reader, options) 
     643          pq = Array.new(term_freq_map.size) 
     644           
     645          similarity = options[:similarity] 
     646          num_docs = reader.num_docs 
     647          term_freq_map.each_pair do |word, tf| 
     648            # filter out words that don't occur enough times in the source 
     649            next if options[:min_term_freq] && tf < options[:min_term_freq] 
     650             
     651            # go through all the fields and find the largest document frequency 
     652            top_field = options[:field_names].first 
     653            doc_freq = 0 
     654            options[:field_names].each do |field_name|  
     655              freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word)) 
     656              if freq > doc_freq  
     657                top_field = field_name 
     658                doc_freq = freq 
     659              end 
     660            end 
     661            # filter out words that don't occur in enough docs 
     662            next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq] 
     663            next if doc_freq == 0 # index update problem ? 
     664             
     665            idf = similarity.idf(doc_freq, num_docs) 
     666            score = tf * idf 
     667            pq << FrequencyQueueItem.new(word, top_field, score) 
     668          end 
     669          pq.compact! 
     670          pq.sort! { |a,b| a.score<=>b.score } 
     671          return pq 
     672        end 
     673         
     674        def noise_word?(text) 
     675          false 
     676        end 
     677 
    489678      end 
     679 
     680      class FrequencyQueueItem 
     681        attr_reader :word, :field, :score 
     682        def initialize(word, field, score) 
     683          @word = word; @field = field; @score = score 
     684        end 
     685        def to_term 
     686          Ferret::Index::Term.new(self.field, self.word) 
     687        end 
     688      end 
     689       
    490690    end 
    491691  end 

To edit pages or tickets please login with username/password: aaf/aaf