From: Frank Lichtenheld Date: Mon, 18 Jun 2007 17:40:28 +0000 (+0200) Subject: Fulltext search: Greatly improve by using a more fuzzy approach X-Git-Url: https://git.deb.at/?a=commitdiff_plain;h=07fdff9c69f8bd3b4d357fd61042f588701dd1c6;p=deb%2Fpackages.git Fulltext search: Greatly improve by using a more fuzzy approach Most of this was done at the suggestion of Enrico Zini. Using OP_OR instead of OP_AND as default can actually lead to better matches because the ones found with OP_AND often aren't actually the most relevant ones. This is especially true when using more than two keywords. Accordingly sort by relevance on the result page. Improve indexing: + Add both the unstemmed and the stemmed description to the index; this will increase the relevance of exact matches. Only the former (unstemmed) is done with positional information + Really index debtags and the package name --- diff --git a/bin/parse-packages b/bin/parse-packages index 64a00cd..b0883a2 100755 --- a/bin/parse-packages +++ b/bin/parse-packages @@ -267,32 +267,50 @@ tie %descriptions_db, "DB_File", "$DBDIR/descriptions.db.new", or die "Error creating DB: $!"; print "Index $#descriptions descriptions\n"; for (my $i=1; $i<= $#descriptions; $i++) { - my $plain_description = $descriptions[$i]; # strip away additional data - my ($only_desc) = split /\000/o, $plain_description, 2; + my ($only_desc, $pkg, $tags) = split /\000/o, $descriptions[$i], 3; # WARNING: This needs to correspond with what happens in -# Packages/Search.pm:do_fulltext_search - $plain_description =~ tr [A-Z] [a-z]; - # ensure one space on both ends - $plain_description = " $plain_description "; - $plain_description =~ s/[(),.-]+//og; - $plain_description =~ s#[^a-z0-9_/+]+# #og; +# Packages/Search.pm:do_xapian_search + $only_desc =~ s#[^\w/+]+# #og; #XAPIAN eval { - my @words = split /\s+/, $plain_description; - my $stem_words = $stemmer->stem( \@words ); + my @words = split /\s+/, $only_desc; + unshift @words, $pkg; + my $doc = 
Search::Xapian::Document->new() or die "can't create doc object for $i: $!\n"; if ($doc->set_data($i)){ warn "can't set_data in doc object for $i: $!\n"; } - for my $j (0 .. (@$stem_words-1)) { - next if $stem_words->[$j] =~ /^\s*$/o; - if ($doc->add_posting($stem_words->[$j], $j)) { - warn "can't add word $stem_words->[$j] $j: $!\n"; + + # package with prefix + if ($doc->add_term("P$pkg")) { + warn "can't add term P$pkg: $!\n"; + } + # description, unstemmed with positional info + for my $j (0 .. (@words-1)) { + next if $words[$j] =~ /^\s*$/o; + if ($doc->add_posting($words[$j], $j)) { + warn "can't add posting $words[$j] at $j: $!\n"; } } + # description, stemmed + my $stem_words = $stemmer->stem( \@words ); + foreach my $w (@$stem_words) { + next if $w =~ /^\s*$/o; + if ($doc->add_term($w)) { + warn "can't add term $w: $!\n"; + } + } + if ($tags) { + foreach my $t (split /, /, $tags) { + if ($doc->add_term($t)) { + warn "can't add term $t: $!\n"; + } + } + } + $xapian_db->add_document($doc) or warn "failed to add document: $i\n"; }; diff --git a/lib/Packages/DoSearch.pm b/lib/Packages/DoSearch.pm index 5b0b4bb..d395a31 100644 --- a/lib/Packages/DoSearch.pm +++ b/lib/Packages/DoSearch.pm @@ -79,6 +79,14 @@ sub do_search { if (@results) { my (%pkgs, %subsect, %sect, %archives, %desc, %binaries, %provided_by); + my %sort_by_relevance; + for (1 ... 
scalar @results) { +# debug("$results[$_][0] => $_", 4) if DEBUG; + $sort_by_relevance{$results[$_-1][0]} = $_; + } +# use Data::Dumper; +# debug( "sort_by_relevance=".Dumper(\%sort_by_relevance), 4); + unless ($opts->{source}) { foreach (@results) { my ($pkg_t, $archive, $suite, $arch, $section, $subsection, @@ -98,7 +106,12 @@ sub do_search { } my %uniq_pkgs = map { $_ => 1 } (keys %pkgs, keys %provided_by); - my @pkgs = sort keys %uniq_pkgs; + my @pkgs; + if ($searchon eq 'names') { + @pkgs = sort keys %uniq_pkgs; + } else { + @pkgs = sort { $sort_by_relevance{$a} <=> $sort_by_relevance{$b} } keys %uniq_pkgs; + } process_packages( $page_content, 'packages', \%pkgs, \@pkgs, $opts, \@keywords, \&process_package, \%provided_by, \%archives, \%sect, \%subsect, diff --git a/lib/Packages/Search.pm b/lib/Packages/Search.pm index 671e340..66b0944 100644 --- a/lib/Packages/Search.pm +++ b/lib/Packages/Search.pm @@ -204,41 +204,38 @@ sub do_xapian_search { # NOTE: this needs to correspond with parse-packages! 
my @tmp; foreach my $keyword (@$keywords) { - $keyword =~ tr [A-Z] [a-z]; - if ($opts->{exact}) { - $keyword = " $keyword "; - } - $keyword =~ s/[(),.-]+//og; - $keyword =~ s;[^a-z0-9_/+]+; ;og; + $keyword =~ s;[^\w/+]+; ;og; push @tmp, $keyword; } my $stemmer = Lingua::Stem->new(); - $keywords = $stemmer->stem( @tmp ); + my $stemmed_keywords = $stemmer->stem( @tmp ); my $db = Search::Xapian::Database->new( $db ); - my $enq = $db->enquire( OP_AND, @$keywords ); + my $enq = $db->enquire( OP_OR, @$keywords, @$stemmed_keywords ); debug( "Xapian Query was: ".$enq->get_query()->get_description(), 1) if DEBUG; - my @matches = $enq->matches(0, 100); + my @matches = $enq->matches(0, 999); - my $numres = 0; - my %tmp_results; + my (@order, %tmp_results); foreach my $match ( @matches ) { my $id = $match->get_docid(); my $result = $did2pkg->{$id}; foreach (split /\000/o, $result) { my @data = split /\s/, $_, 3; -# debug ("Considering $data[0], arch = $data[2]", 3) if DEBUG; + debug ("Considering $data[0], arch = $data[2], relevance=".$match->get_percent(), 3) if DEBUG; # next unless $data[2] eq 'all' || $opts->{h_archs}{$data[2]}; # debug ("Ok", 3) if DEBUG; - $numres++ unless $tmp_results{$data[0]}++; + unless ($tmp_results{$data[0]}++) { + push @order, $data[0]; + } } - last if $numres > 100; + last if @order > 100; } undef $db; - $too_many_hits++ if $numres > 100; + $too_many_hits++ if @order > 100; - foreach my $pkg (keys %tmp_results) { + debug ("ORDER: @order", 2) if DEBUG; + foreach my $pkg (@order) { &$read_entry( $packages, $pkg, $results, $non_results, $opts ); } }