| 488 | | |
|---|
| | 506 | |
|---|
| | 507 | # BIG TODO: this file really gets too big. need to refactor a bit... |
|---|
| | 508 | # maybe extract the more like this stuff, could be useful somewhere |
|---|
| | 509 | # else, too... |
|---|
| | 510 | |
|---|
| | 511 | |
|---|
| | 512 | # returns other instances of this class, which have similar contents |
|---|
| | 513 | # like this one. Basically works like this: find out n most interesting |
|---|
| | 514 | # (i.e. characteristic) terms from this document, and then build a |
|---|
| | 515 | # query from those which is run against the whole index. Which terms |
|---|
| | 516 | # are interesting is decided on various criteria which can be |
|---|
| | 517 | # influenced by the given options. |
|---|
| | 518 | # |
|---|
| | 519 | # The algorithm used here is a quite straight port of the MoreLikeThis class |
|---|
| | 520 | # from Apache Lucene. |
|---|
| | 521 | # |
|---|
| | 522 | # options are: |
|---|
| | 523 | # :field_names : Array of field names to use for similarity search (mandatory) |
|---|
| | 524 | # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc. |
|---|
| | 525 | # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs |
|---|
| | 526 | # :min_word_length => nil, # Ignore words if less than this len (longer |
|---|
| | 527 | # words tend to be more characteristic for the document they occur in). |
|---|
| | 528 | # :max_word_length => nil, # Ignore words if greater than this len. |
|---|
| | 529 | # :max_query_terms => 25, # maximum number of terms in the query built |
|---|
| | 530 | # :max_num_tokens => 5000, # maximum number of tokens to examine in a |
|---|
| | 531 | # single field |
|---|
| | 532 | # :boost => false, # when true, a boost according to the |
|---|
| | 533 | # relative score of a term is applied to this Term's TermQuery. |
|---|
| | 534 | # :similarity => Ferret::Search::Similarity.default, # the similarity |
|---|
| | 535 | # implementation to use |
|---|
| | 536 | # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to |
|---|
| | 537 | # use |
|---|
| | 538 | def more_like_this(options={}) |
|---|
| | 539 | options = { |
|---|
| | 540 | :field_names => nil, # Default field names |
|---|
| | 541 | :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc. |
|---|
| | 542 | :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs |
|---|
| | 543 | :min_word_length => nil, # Ignore words if less than this len. |
|---|
| | 544 | :max_word_length => nil, # Ignore words if greater than this len. |
|---|
| | 545 | :max_query_terms => 25, # maximum number of terms in the query built |
|---|
| | 546 | :max_num_tokens => 5000, |
|---|
| | 547 | :boost => false, |
|---|
| | 548 | :similarity => Ferret::Search::Similarity.default, |
|---|
| | 549 | :analyzer => Ferret::Analysis::StandardAnalyzer.new |
|---|
| | 550 | }.update(options) |
|---|
| | 551 | index = self.class.ferret_index |
|---|
| | 552 | begin |
|---|
| | 553 | reader = index.send(:reader) |
|---|
| | 554 | rescue |
|---|
| | 555 | # ferret >=0.9, C-Version doesn't allow access to Index#reader |
|---|
| | 556 | reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false)) |
|---|
| | 557 | end |
|---|
| | 558 | doc_number = self.document_number |
|---|
| | 559 | term_freq_map = retrieve_terms(document_number, reader, options) |
|---|
| | 560 | priority_queue = create_queue(term_freq_map, reader, options) |
|---|
| | 561 | query = create_query(priority_queue, options) |
|---|
| | 562 | self.class.find_by_contents(query) |
|---|
| | 563 | end |
|---|
| | 564 | |
|---|
| | 565 | |
|---|
| | 566 | def create_query(priority_queue, options={}) |
|---|
| | 567 | query = Ferret::Search::BooleanQuery.new |
|---|
| | 568 | qterms = 0 |
|---|
| | 569 | best_score = 0 |
|---|
| | 570 | while(cur = priority_queue.pop) |
|---|
| | 571 | term_query = Ferret::Search::TermQuery.new(cur.to_term) |
|---|
| | 572 | |
|---|
| | 573 | if options[:boost] |
|---|
| | 574 | # boost term according to relative score |
|---|
| | 575 | # TODO untested |
|---|
| | 576 | best_score ||= cur.score |
|---|
| | 577 | term_query.boost = cur.score / best_score |
|---|
| | 578 | end |
|---|
| | 579 | begin |
|---|
| | 580 | query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD) |
|---|
| | 581 | rescue Ferret::Search::BooleanQuery::TooManyClauses |
|---|
| | 582 | break |
|---|
| | 583 | end |
|---|
| | 584 | qterms += 1 |
|---|
| | 585 | break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms] |
|---|
| | 586 | end |
|---|
| | 587 | # exclude ourselves |
|---|
| | 588 | t = Ferret::Index::Term.new('id', self.id.to_s) |
|---|
| | 589 | query.add_query(Ferret::Search::TermQuery.new(t), |
|---|
| | 590 | Ferret::Search::BooleanClause::Occur::MUST_NOT) |
|---|
| | 591 | return query |
|---|
| | 592 | end |
|---|
| | 593 | |
|---|
| | 594 | |
|---|
| | 595 | def document_number |
|---|
| | 596 | hits = self.class.ferret_index.search("id:#{self.id}") |
|---|
| | 597 | hits.each { |hit, score| return hit } |
|---|
| | 598 | end |
|---|
| | 599 | |
|---|
| | 600 | # creates a term/term_frequency map for terms from the fields |
|---|
| | 601 | # given in options[:field_names] |
|---|
| | 602 | def retrieve_terms(doc_number, reader, options) |
|---|
| | 603 | field_names = options[:field_names] |
|---|
| | 604 | max_num_tokens = options[:max_num_tokens] |
|---|
| | 605 | term_freq_map = Hash.new(0) |
|---|
| | 606 | field_names.each do |field| |
|---|
| | 607 | term_freq_vector = reader.get_term_vector(document_number, field) |
|---|
| | 608 | if term_freq_vector |
|---|
| | 609 | # use stored term vector |
|---|
| | 610 | # TODO untested |
|---|
| | 611 | term_freq_vector.terms.each_with_index do |term, i| |
|---|
| | 612 | term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term) |
|---|
| | 613 | end |
|---|
| | 614 | else |
|---|
| | 615 | # no term vector stored, extract terms from document content |
|---|
| | 616 | # TODO: if no content stored, maybe use content from self ? |
|---|
| | 617 | doc = reader.get_document(doc_number) |
|---|
| | 618 | token_count = 0 |
|---|
| | 619 | |
|---|
| | 620 | # C-Ferret >=0.9 again, no #each in tokenstream :-( |
|---|
| | 621 | ts = options[:analyzer].token_stream(field, doc[field]) |
|---|
| | 622 | while token = ts.next |
|---|
| | 623 | #options[:analyzer].token_stream(field, doc[field]).each do |token| |
|---|
| | 624 | break if (token_count+=1) > max_num_tokens |
|---|
| | 625 | |
|---|
| | 626 | next if noise_word?(token_text(token)) |
|---|
| | 627 | term_freq_map[token_text(token)] += 1 |
|---|
| | 628 | end |
|---|
| | 629 | end |
|---|
| | 630 | end |
|---|
| | 631 | term_freq_map |
|---|
| | 632 | end |
|---|
| | 633 | |
|---|
| | 634 | # extract textual value of a token |
|---|
| | 635 | def token_text(token) |
|---|
| | 636 | # token.term_text is for ferret 0.3.2 |
|---|
| | 637 | token.respond_to?(:text) ? token.text : token.term_text |
|---|
| | 638 | end |
|---|
| | 639 | |
|---|
| | 640 | # create an ordered(by score) list of word,fieldname,score |
|---|
| | 641 | # structures |
|---|
| | 642 | def create_queue(term_freq_map, reader, options) |
|---|
| | 643 | pq = Array.new(term_freq_map.size) |
|---|
| | 644 | |
|---|
| | 645 | similarity = options[:similarity] |
|---|
| | 646 | num_docs = reader.num_docs |
|---|
| | 647 | term_freq_map.each_pair do |word, tf| |
|---|
| | 648 | # filter out words that don't occur enough times in the source |
|---|
| | 649 | next if options[:min_term_freq] && tf < options[:min_term_freq] |
|---|
| | 650 | |
|---|
| | 651 | # go through all the fields and find the largest document frequency |
|---|
| | 652 | top_field = options[:field_names].first |
|---|
| | 653 | doc_freq = 0 |
|---|
| | 654 | options[:field_names].each do |field_name| |
|---|
| | 655 | freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word)) |
|---|
| | 656 | if freq > doc_freq |
|---|
| | 657 | top_field = field_name |
|---|
| | 658 | doc_freq = freq |
|---|
| | 659 | end |
|---|
| | 660 | end |
|---|
| | 661 | # filter out words that don't occur in enough docs |
|---|
| | 662 | next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq] |
|---|
| | 663 | next if doc_freq == 0 # index update problem ? |
|---|
| | 664 | |
|---|
| | 665 | idf = similarity.idf(doc_freq, num_docs) |
|---|
| | 666 | score = tf * idf |
|---|
| | 667 | pq << FrequencyQueueItem.new(word, top_field, score) |
|---|
| | 668 | end |
|---|
| | 669 | pq.compact! |
|---|
| | 670 | pq.sort! { |a,b| a.score<=>b.score } |
|---|
| | 671 | return pq |
|---|
| | 672 | end |
|---|
| | 673 | |
|---|
| | 674 | def noise_word?(text) |
|---|
| | 675 | false |
|---|
| | 676 | end |
|---|
| | 677 | |
|---|