| 645 | | |
|---|
# Returns other records whose contents are similar to this one.
#
# Basically works like this: find out the n most interesting (i.e.
# characteristic) terms of this document, and then build a query from those
# which is run against the whole index. Which terms are interesting is
# decided on various criteria which can be influenced by the given options.
#
# The algorithm used here is a quite straight port of the MoreLikeThis class
# from Apache Lucene.
#
# options are:
# :field_names     : Array of field names to use for similarity search
#                    (mandatory, ArgumentError is raised when missing)
# :min_term_freq   => 2, # Ignore terms with less than this frequency in
#                    the source doc.
# :min_doc_freq    => 5, # Ignore words which do not occur in at least this
#                    many docs.
# :min_word_length => 0, # Ignore words shorter than this length (longer
#                    words tend to be more characteristic for the document
#                    they occur in). 0 means no lower limit.
# :max_word_length => 0, # Ignore words longer than this length. 0 means no
#                    upper limit.
# :max_query_terms => 25, # maximum number of terms in the query built
# :max_num_tokens  => 5000, # maximum number of tokens to examine in a
#                    single field
# :boost           => false, # when true, a boost according to the relative
#                    score of a term is applied to this Term's TermQuery.
# :similarity      => Ferret::Search::Similarity.default, # the similarity
#                    implementation to use
# :analyzer        => Ferret::Analysis::StandardAnalyzer.new, # the analyzer
#                    to use
# :append_to_query => nil, # proc taking a query object as argument, which
#                    is called after generating the query. Can be used to
#                    further manipulate the query used to find related
#                    documents, e.g. to constrain the search to a given
#                    class in single table inheritance scenarios.
# :base_class      => self.class, # class whose find_by_contents is called
#                    with the generated query; useful in STI scenarios
#                    where BaseClass.find_by_contents can be used to
#                    retrieve results from other classes, too.
#
# find_options : options handed over to find_by_contents
#
# Returns whatever options[:base_class].find_by_contents returns.
def more_like_this(options = {}, find_options = {})
  options = {
    :field_names => nil,  # Default field names
    :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
    :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
    :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
    :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
    :max_query_terms => 25,  # maximum number of terms in the query built
    :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
    :boost => false,
    :similarity => Ferret::Search::Similarity.default,
    :analyzer => Ferret::Analysis::StandardAnalyzer.new,
    :append_to_query => nil,
    :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
  }.update(options)

  # fail early with a clear message instead of a NoMethodError on nil later on
  raise ArgumentError, ':field_names option is mandatory' if options[:field_names].nil?

  index = self.class.ferret_index
  owns_reader = false
  begin
    reader = index.send(:reader)
  rescue
    # ferret >=0.9, C-Version doesn't allow access to Index#reader
    reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
    owns_reader = true
  end

  begin
    # look the document up once and hand the number down
    doc_number = self.document_number
    term_freq_map = retrieve_terms(doc_number, reader, options)
    priority_queue = create_queue(term_freq_map, reader, options)
    query = create_query(priority_queue, options)
  ensure
    # only close a reader opened here; the index keeps managing its own
    reader.close if owns_reader
  end

  options[:append_to_query].call(query) if options[:append_to_query]
  options[:base_class].find_by_contents(query, find_options)
end
|---|
| 703 | | |
|---|
| 704 | | |
|---|
# Builds the boolean query used to find similar documents.
#
# Items are popped off the end of priority_queue (so the queue is emptied)
# and each one is added as a :should TermQuery until the queue is exhausted,
# the BooleanQuery refuses further clauses (TooManyClauses), or
# options[:max_query_terms] terms have been added. A missing, nil or
# non-positive :max_query_terms means "no limit".
#
# When options[:boost] is set, every term query is boosted by its score
# relative to the score of the first popped item.
#
# A final :must_not clause on the 'id' field excludes this record itself.
#
# priority_queue : Array of items responding to #to_term and #score
# options        : Hash, reads :boost and :max_query_terms
#
# Returns the Ferret::Search::BooleanQuery.
def create_query(priority_queue, options = {})
  query = Ferret::Search::BooleanQuery.new
  # nil.to_i == 0, so calling with the default (empty) options does not raise
  max_terms = options[:max_query_terms].to_i
  qterms = 0
  best_score = nil

  while (cur = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(cur.to_term)

    if options[:boost]
      # boost term according to relative score
      # TODO untested
      best_score ||= cur.score
      # to_f keeps the ratio from truncating to 0 when scores are Integers
      term_query.boost = cur.score.to_f / best_score
    end

    begin
      query.add_query(term_query, :should)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end

    qterms += 1
    break if max_terms > 0 && qterms >= max_terms
  end

  # exclude ourselves
  own_id_term = Ferret::Index::Term.new('id', self.id.to_s)
  query.add_query(Ferret::Search::TermQuery.new(own_id_term), :must_not)
  query
end
|---|
| 731 | | |
|---|
| 732 | | |
|---|
# Looks this record up in the class' ferret index by searching for
# "id:<own id>".
#
# Returns the first hit yielded by the search result, or nil when the search
# yields nothing (instead of leaking the hits collection to the caller).
def document_number
  hits = self.class.ferret_index.search("id:#{self.id}")
  hits.each { |hit, _score| return hit }
  nil
end
|---|
| 737 | | |
|---|
# Creates a term => term frequency map for the terms found in the fields
# given in options[:field_names].
#
# For every field the terms are taken from, in order of preference:
#   1. the term vector the reader has for the document,
#   2. the field content stored in the indexed document,
#   3. the content this instance provides via content_for_field_name.
# In cases 2 and 3 the content is run through options[:analyzer] and at most
# options[:max_num_tokens] tokens per field are examined.
# Terms rejected by noise_word? are left out.
#
# doc_number : number of this record's document in the index
# reader     : index reader used for term vector and document lookup
# options    : Hash, reads :field_names, :max_num_tokens and :analyzer
#              (and is handed on to noise_word?)
#
# Returns a Hash (default value 0) mapping term text to its frequency.
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  doc = nil
  field_names.each do |field|
    # use the number handed in by the caller; looking it up again would run
    # another index search for every single field
    term_freq_vector = reader.get_term_vector(doc_number, field)
    if term_freq_vector
      # use stored term vector
      # TODO untested
      term_freq_vector.terms.each_with_index do |term, i|
        term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term, options)
      end
    else
      # no term vector stored, but we may have stored the contents in the
      # index -> extract terms from there; the document is fetched only once
      doc ||= reader.get_document(doc_number)
      # no term vector, no stored content, so try content from this instance
      content = doc[field] || content_for_field_name(field)
      token_count = 0

      # C-Ferret >=0.9 again, no #each in tokenstream :-(
      ts = options[:analyzer].token_stream(field, content)
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        text = token_text(token)
        next if noise_word?(text, options)
        term_freq_map[text] += 1
      end
    end
  end
  term_freq_map
end
|---|
| 776 | | |
|---|
# Returns the textual value of an analyzer token.
#
# Tokens responding to #text are asked for that; anything else is expected
# to answer #term_text (which is what ferret 0.3.2 tokens provide).
def token_text(token)
  return token.text if token.respond_to?(:text)

  token.term_text
end
|---|
| 782 | | |
|---|
# Creates a list of word/fieldname/score structures (FrequencyQueueItem),
# ordered by ascending score, so that Array#pop yields the best scoring item
# first.
#
# A word is left out when
#   * its frequency in the source document is below options[:min_term_freq],
#   * it does not occur in any document for any of the given fields, or
#   * its largest per-field document frequency is below
#     options[:min_doc_freq].
# Either minimum may be nil to disable the respective check.
#
# The score of a word is its term frequency multiplied by the idf the
# similarity computes from the largest document frequency found; the field
# that document frequency was found in becomes the item's field.
#
# term_freq_map : Hash of word => frequency in the source document
# reader        : index reader, asked for #num_docs and #doc_freq
# options       : Hash, reads :similarity, :field_names, :min_term_freq and
#                 :min_doc_freq
#
# Returns an Array of FrequencyQueueItem.
def create_queue(term_freq_map, reader, options)
  similarity = options[:similarity]
  field_names = options[:field_names]
  min_term_freq = options[:min_term_freq]
  min_doc_freq = options[:min_doc_freq]
  num_docs = reader.num_docs
  # start empty; there are no nil placeholders to compact away afterwards
  pq = []

  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source
    next if min_term_freq && tf < min_term_freq

    # go through all the fields and find the largest document frequency
    top_field = field_names.first
    doc_freq = 0
    field_names.each do |field_name|
      freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
      if freq > doc_freq
        top_field = field_name
        doc_freq = freq
      end
    end

    next if doc_freq == 0 # index update problem ?
    # filter out words that don't occur in enough docs
    next if min_doc_freq && doc_freq < min_doc_freq

    score = tf * similarity.idf(doc_freq, num_docs)
    pq << FrequencyQueueItem.new(word, top_field, score)
  end

  pq.sort! { |a, b| a.score <=> b.score }
  pq
end
|---|
| 816 | | |
|---|
# Decides whether a word should be ignored when collecting terms.
#
# A word is noise when
#   * options[:min_word_length] is positive and the word is shorter,
#   * options[:max_word_length] is positive and the word is longer, or
#   * options[:stop_words] is given and includes the word.
# Length limits that are missing, nil or 0 are not applied.
#
# text    : the word to check
# options : Hash, reads :min_word_length, :max_word_length and :stop_words
#
# Returns true or false.
def noise_word?(text, options)
  len = text.length
  # nil.to_i == 0, so an unset limit simply disables the check
  min_len = options[:min_word_length].to_i
  max_len = options[:max_word_length].to_i
  return true if min_len > 0 && len < min_len
  return true if max_len > 0 && len > max_len

  # check the stop word list itself, not the keys of the options hash
  stop_words = options[:stop_words]
  stop_words ? stop_words.include?(text) : false
end
|---|
| 825 | | |
|---|
# Fetches the content of the given field from this instance.
#
# Tries, in this order, and returns the first value that is neither nil nor
# false: self[field], the instance variable named after the field, and
# finally the result of calling the method named after the field.
def content_for_field_name(field)
  attribute_value = self[field]
  return attribute_value if attribute_value

  ivar_value = instance_variable_get("@#{field}".to_sym)
  return ivar_value if ivar_value

  send(field.to_sym)
end
|---|
| 829 | | |
|---|
| 830 | | end |
|---|
| 831 | | |
|---|
| 832 | | class FrequencyQueueItem |
|---|
| 833 | | attr_reader :word, :field, :score |
|---|
| 834 | | def initialize(word, field, score) |
|---|
| 835 | | @word = word; @field = field; @score = score |
|---|
| 836 | | end |
|---|
| 837 | | def to_term |
|---|
| 838 | | Ferret::Index::Term.new(self.field, self.word) |
|---|
| 839 | | end |
|---|