To edit pages or tickets please login with username/password: aaf/aaf

Changeset 91

Show
Ignore:
Timestamp:
09/09/06 17:04:17 (2 years ago)
Author:
jk
Message:

r1103@monsoon: jk | 2006-09-09 14:42:55 +0200
extract more_like_this into its own module and file

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/plugin/acts_as_ferret/lib/acts_as_ferret.rb

    r88 r91  
    545545       
    546546      module InstanceMethods 
     547        include MoreLikeThis 
    547548         
    548549        # re-enable ferret indexing after a call to #disable_ferret 
     
    643644        # else, too... 
    644645 
    645  
    # Returns other instances of this class which have similar contents
    # like this one. Basically works like this: find out the n most
    # interesting (i.e. characteristic) terms from this document, and then
    # build a query from those which is run against the whole index. Which
    # terms are interesting is decided on various criteria which can be
    # influenced by the given options.
    #
    # The algorithm used here is a quite straight port of the MoreLikeThis
    # class from Apache Lucene.
    #
    # options (defaults in parentheses):
    # :field_names     (nil)   Array of field names to use for the similarity
    #                          search (mandatory)
    # :min_term_freq   (2)     ignore terms with less than this frequency in
    #                          the source doc
    # :min_doc_freq    (5)     ignore words which do not occur in at least
    #                          this many docs
    # :min_word_length (0)     ignore words shorter than this; 0 ignores none
    # :max_word_length (0)     ignore words longer than this; 0 ignores none
    # :max_query_terms (25)    maximum number of terms in the query built
    # :max_num_tokens  (5000)  maximum number of tokens to examine in a
    #                          single field
    # :boost           (false) when true, a boost according to the relative
    #                          score of a term is applied to its TermQuery
    # :similarity      (Ferret::Search::Similarity.default) the similarity
    #                          implementation to use
    # :analyzer        (Ferret::Analysis::StandardAnalyzer.new) the analyzer
    #                          to use
    # :append_to_query (nil)   proc taking the generated query as argument;
    #                          called after generating the query, it can be
    #                          used to further manipulate the query, i.e. to
    #                          constrain the search to a given class in
    #                          single table inheritance scenarios
    # :base_class      (self.class) class whose find_by_contents runs the
    #                          query; useful in STI scenarios where
    #                          BaseClass.find_by_contents can be used to
    #                          retrieve results from other classes, too
    #
    # find_options: options handed over to find_by_contents
    #
    # Returns whatever options[:base_class].find_by_contents returns.
    def more_like_this(options = {}, find_options = {})
      options = {
        :field_names => nil,
        :min_term_freq => 2,
        :min_doc_freq => 5,
        :min_word_length => 0,
        :max_word_length => 0,
        :max_query_terms => 25,
        :max_num_tokens => 5000,
        :boost => false,
        :similarity => Ferret::Search::Similarity.default,
        :analyzer => Ferret::Analysis::StandardAnalyzer.new,
        :append_to_query => nil,
        :base_class => self.class
      }.update(options)

      index = self.class.ferret_index
      opened_reader = false
      begin
        reader = index.send(:reader)
      rescue
        # ferret >=0.9, C-Version doesn't allow access to Index#reader
        reader = Ferret::Index::IndexReader.open(
          Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
        opened_reader = true
      end

      begin
        # look the document number up once and hand it on, instead of
        # searching the index again for it
        doc_number = self.document_number
        term_freq_map = retrieve_terms(doc_number, reader, options)
        priority_queue = create_queue(term_freq_map, reader, options)
      ensure
        # only close the reader opened here; the index keeps owning its own
        reader.close if opened_reader && reader.respond_to?(:close)
      end

      query = create_query(priority_queue, options)
      options[:append_to_query].call(query) if options[:append_to_query]
      options[:base_class].find_by_contents(query, find_options)
    end
    703  
    704          
    # Builds the Ferret BooleanQuery used to find similar documents.
    #
    # Items are popped off the end of priority_queue (which is emptied in
    # the process) and each one is added as an optional (:should) TermQuery.
    # Adding stops when the queue is empty, when the query reports
    # TooManyClauses, or when options[:max_query_terms] terms were added. A
    # missing or non-positive :max_query_terms means no limit.
    #
    # With options[:boost] set, every term query is boosted by its score
    # relative to the score of the first item popped.
    #
    # Finally a :must_not clause on this record's id excludes the record
    # itself from the results.
    #
    # Returns the BooleanQuery.
    def create_query(priority_queue, options={})
      query = Ferret::Search::BooleanQuery.new
      # nil.to_i is 0, so calling without :max_query_terms no longer raises
      max_terms = options[:max_query_terms].to_i
      qterms = 0
      best_score = nil
      while (cur = priority_queue.pop)
        term_query = Ferret::Search::TermQuery.new(cur.to_term)

        if options[:boost]
          # boost term according to relative score
          # TODO untested
          best_score ||= cur.score
          # to_f keeps integer scores from being truncated to 0
          term_query.boost = cur.score.to_f / best_score
        end
        begin
          query.add_query(term_query, :should)
        rescue Ferret::Search::BooleanQuery::TooManyClauses
          break
        end
        qterms += 1
        break if max_terms > 0 && qterms >= max_terms
      end
      # exclude ourselves
      t = Ferret::Index::Term.new('id', self.id.to_s)
      query.add_query(Ferret::Search::TermQuery.new(t), :must_not)
      query
    end
    731  
    732          
    # Looks this record up in the class' ferret index by searching for
    # "id:<id>" and returns the first hit yielded by the search result.
    #
    # Returns nil when the search yields no hit.
    def document_number
      hits = self.class.ferret_index.search("id:#{self.id}")
      hits.each { |hit, _score| return hit }
      # no hit: do not leak the return value of #each to the caller
      nil
    end
    737  
    # Creates a term => term frequency map for the terms of document
    # doc_number, taken from the fields given in options[:field_names].
    #
    # Per field, the first available source is used:
    # 1. the term vector stored in the index (reader.get_term_vector),
    # 2. the field content stored in the index (reader.get_document),
    # 3. the content of this instance (content_for_field_name).
    # For 2. and 3. the content is tokenized with options[:analyzer] and at
    # most options[:max_num_tokens] tokens per field are examined.
    #
    # Terms for which noise_word? is true are left out. Frequencies of a
    # term occurring in several fields are summed up.
    #
    # Returns a Hash with a default value of 0.
    def retrieve_terms(doc_number, reader, options)
      field_names = options[:field_names]
      max_num_tokens = options[:max_num_tokens]
      term_freq_map = Hash.new(0)
      doc = nil
      field_names.each do |field|
        # use the doc_number handed in; calling #document_number here would
        # run another index search for every field
        term_freq_vector = reader.get_term_vector(doc_number, field)
        if term_freq_vector
          # use stored term vector
          # TODO untested
          term_freq_vector.terms.each_with_index do |term, i|
            term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term, options)
          end
        else
          # no term vector stored, but we have stored the contents in the index
          # -> extract terms from there
          doc ||= reader.get_document(doc_number)
          # no term vector, no stored content, so try content from this instance
          content = doc[field] || content_for_field_name(field)
          token_count = 0

          # C-Ferret >=0.9 again, no #each in tokenstream :-(
          ts = options[:analyzer].token_stream(field, content)
          while (token = ts.next)
            break if (token_count += 1) > max_num_tokens
            text = token_text(token)
            next if noise_word?(text, options)
            term_freq_map[text] += 1
          end
        end
      end
      term_freq_map
    end
    776  
    # Returns the text of the given token: token.text when the token
    # responds to #text, token.term_text otherwise (ferret 0.3.2 tokens).
    def token_text(token)
      return token.text if token.respond_to?(:text)
      token.term_text
    end
    782  
    # Creates a list of FrequencyQueueItem (word, field name, score)
    # structures, sorted by ascending score, so that Array#pop yields the
    # best scoring item first.
    #
    # For every word of term_freq_map the field (out of
    # options[:field_names]) with the largest document frequency is chosen;
    # on a tie the field listed first wins. Words are skipped when
    # - their term frequency is below options[:min_term_freq],
    # - their document frequency is below options[:min_doc_freq],
    # - their document frequency is 0.
    # The score of a word is its term frequency multiplied by
    # options[:similarity].idf(doc_freq, reader.num_docs).
    def create_queue(term_freq_map, reader, options)
      # start empty: Array.new(size) would pre-fill the list with nils
      pq = []

      similarity = options[:similarity]
      num_docs = reader.num_docs
      term_freq_map.each_pair do |word, tf|
        # filter out words that don't occur enough times in the source
        next if options[:min_term_freq] && tf < options[:min_term_freq]

        # go through all the fields and find the largest document frequency
        top_field = options[:field_names].first
        doc_freq = 0
        options[:field_names].each do |field_name|
          freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
          if freq > doc_freq
            top_field = field_name
            doc_freq = freq
          end
        end
        # filter out words that don't occur in enough docs
        next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
        next if doc_freq == 0 # index update problem ?

        idf = similarity.idf(doc_freq, num_docs)
        score = tf * idf
        pq << FrequencyQueueItem.new(word, top_field, score)
      end
      pq.sort! { |a, b| a.score <=> b.score }
      pq
    end
    816          
    # Tells whether text should be ignored when collecting terms.
    #
    # A word is noise when
    # - options[:min_word_length] is greater than 0 and the word is shorter,
    # - options[:max_word_length] is greater than 0 and the word is longer,
    # - options[:stop_words] is given and includes the word.
    # Missing length options are treated like 0 (no limit).
    # options[:stop_words] only needs to respond to #include?.
    #
    # Returns true or false.
    def noise_word?(text, options)
      len = text.length
      min_len = options[:min_word_length].to_i
      max_len = options[:max_word_length].to_i
      return true if min_len > 0 && len < min_len
      return true if max_len > 0 && len > max_len

      # look the word up in the stop word list itself, not in the options hash
      stop_words = options[:stop_words]
      stop_words ? stop_words.include?(text) : false
    end
    825  
    # Returns the content of the given field taken from this instance. The
    # first truthy value of the following is used: self[field], the instance
    # variable named like the field, the result of calling the method named
    # like the field.
    def content_for_field_name(field)
      attribute_value = self[field]
      return attribute_value if attribute_value

      ivar_value = instance_variable_get("@#{field}".to_sym)
      return ivar_value if ivar_value

      send(field.to_sym)
    end
    829  
    830       end 
    831  
    # Entry of the list built by create_queue: a word, the name of the field
    # chosen for it and the score computed for it.
    class FrequencyQueueItem
      attr_reader :word, :field, :score

      def initialize(word, field, score)
        @word  = word
        @field = field
        @score = score
      end

      # Returns a Ferret::Index::Term for this item's field and word.
      def to_term
        Ferret::Index::Term.new(field, word)
      end
    end
    841647       

To edit pages or tickets please login with username/password: aaf/aaf