From: Frank Lichtenheld Date: Tue, 19 Jun 2007 02:22:45 +0000 (+0200) Subject: Packages::Search: Implement find_similar X-Git-Url: https://git.deb.at/w?a=commitdiff_plain;h=9f0c86ad48e5ccdd89c2ae10fc35baee286f3adf;p=deb%2Fpackages.git Packages::Search: Implement find_similar Suggested by Enrico Zini. For a given package $pkg, search for the term P$pkg, and then extract all the terms from the document you find (you could also implement this part by just looking up the description and applying the same algorithms to it as in parse-packages, haven't benchmarked that against each other). With these terms then make an OR search. Ignore all results for $pkg itself, since these are obvious ;) The first few matches of the rest are usually packages very similar to the one we started with. --- diff --git a/lib/Packages/DoShow.pm b/lib/Packages/DoShow.pm index 1d21462..611d65e 100644 --- a/lib/Packages/DoShow.pm +++ b/lib/Packages/DoShow.pm @@ -120,6 +120,10 @@ sub do_show { my $std = timediff($st1, $st0); debug( "Data search and merging took ".timestr($std) ) if DEBUG; + my @similar = find_similar( $pkg, "$DBDIR/xapian/", + \%did2pkg ); + $contents{similar} = \@similar; + my $did = $page->get_newest( 'description' ); my $desc_md5 = $page->get_newest( 'description-md5' ); my @complete_tags = split(/, /, $page->get_newest( 'tag' )); diff --git a/lib/Packages/Search.pm b/lib/Packages/Search.pm index 66b0944..021f0fd 100644 --- a/lib/Packages/Search.pm +++ b/lib/Packages/Search.pm @@ -58,6 +58,7 @@ our @ISA = qw( Exporter ); our @EXPORT_OK = qw( read_entry read_entry_all read_entry_simple read_src_entry read_src_entry_all find_binaries do_names_search do_fulltext_search do_xapian_search + find_similar ); our %EXPORT_TAGS = ( all => [ @EXPORT_OK ] ); @@ -240,6 +241,50 @@ sub do_xapian_search { } } +sub find_similar { + my ($pkg, $db, $did2pkg) = @_; + + my $db = Search::Xapian::Database->new( $db ); + my $enq = $db->enquire( "P$pkg" ); + debug( "Xapian Query was: ".$enq->get_query()->get_description(), 1) if DEBUG; + my
$first_match = ($enq->matches(0,1))[0]->get_document(); + + my @terms; + my $term_it = $first_match->termlist_begin(); + my $term_end = $first_match->termlist_end(); + + for ($term_it; $term_it ne $term_end; $term_it++) { + debug( "TERM: ".$term_it->get_termname(), 3); + push @terms, $term_it->get_termname(); + } + + my $rel_enq = $db->enquire( OP_OR, @terms ); + debug( "Xapian Query was: ".$rel_enq->get_query()->get_description(), 1) if DEBUG; + my @rel_pkg = $rel_enq->matches(2,20); + + use Data::Dumper; + debug(Dumper(\@rel_pkg),1); + + my (@order, %tmp_results); + foreach my $match ( @rel_pkg ) { + my $id = $match->get_docid(); + my $result = $did2pkg->{$id}; + + foreach (split /\000/o, $result) { + my @data = split /\s/, $_, 3; + debug ("Considering $data[0], arch = $data[2], relevance=".$match->get_percent(), 3) if DEBUG; + next if $data[0] eq $pkg; + unless ($tmp_results{$data[0]}++) { + push @order, $data[0]; + } + } + } + undef $db; + + debug ("ORDER: @order", 2) if DEBUG; + return @order[0..10]; +} + sub find_binaries { my ($pkg, $archive, $suite, $src2bin) = @_; diff --git a/templates/html/show.tmpl b/templates/html/show.tmpl index c4b1e90..88274f4 100644 --- a/templates/html/show.tmpl +++ b/templates/html/show.tmpl @@ -123,6 +123,15 @@

Homepage: [% url | html %]

[% END %] +[% FOREACH sim IN similar %] + [% IF loop.first %] +

Similar packages:

+ ' IF loop.last %] +[% END %] + [% END %]