From b84419b2f87cf3c43b83fca71486be4fae81375e Mon Sep 17 00:00:00 2001
From: Frank Lichtenheld <frank@lichtenheld.de>
Date: Fri, 1 Feb 2008 01:45:41 +0100
Subject: [PATCH] Improve result sampling in case we have too generic keywords

Instead of either displaying the exact match if we have only
one keyword and nothing otherwise, let's try to give people
a sample of results, on the chance that might still be useful.

Mostly just display the first 100 package names found (i.e.
from suffix entries that had no "too many prefixes" value).
Allow suffix entries with a "too many prefixes" value to specify
whether the empty prefix was contained in the set (that effectively
avoids a regression in the case "single keyword, exact match
exists").

If in doubt, prefer package names that begin with the first
keyword and only abort if we have more than 100 of those
(the worst-case behaviour of this should be very limited
due to the "too many prefixes" constraints).
---
 bin/parse-packages     |  7 +++----
 lib/Packages/Search.pm | 35 +++++++++++++++++++++++------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/bin/parse-packages b/bin/parse-packages
index 2e713c6..97edffc 100755
--- a/bin/parse-packages
+++ b/bin/parse-packages
@@ -336,11 +336,10 @@ tie %package_postfixes_db, "DB_File", "$DBDIR/package_postfixes.db.new",
 	or die "Error creating DB: $!";
 while (my ($k, $v) = each(%package_postfixes)) {
 	$v =~ s/.$//s;
-	my $nr = $v;
-	$nr =~ s/[^\000]//g;
-	$nr = length($nr) + 1; # < number of hits
+	my $nr = ($v =~ tr/\000/\000/) + 1;
 	if ($nr > $MAX_PACKAGE_POSTFIXES) {
-		$v = "\001" . $nr;
+	    $v = ($v =~ /\^/) ? "^\001" . $nr
+		: "\001" . $nr;
 	}
 	$package_postfixes_db{$k} = $v;
 }
diff --git a/lib/Packages/Search.pm b/lib/Packages/Search.pm
index 04270f2..c33f142 100644
--- a/lib/Packages/Search.pm
+++ b/lib/Packages/Search.pm
@@ -164,14 +164,22 @@ sub do_names_search {
 
     my $first_keyword = lc shift @$keywords;
     @$keywords = map { lc $_ } @$keywords;
-        
+
     my ($key, $prefixes) = ($first_keyword, '');
-    my %pkgs;
+    my (%pkgs, %pkgs_min);
     $postfixes->seq( $key, $prefixes, R_CURSOR );
     while (index($key, $first_keyword) >= 0) {
-	if ($prefixes =~ /^\001(\d+)/o) {
-	    debug( "$key has too many hits", 2 ) if DEBUG;
-	    $too_many_hits += $1;
+	if ($prefixes =~ /^(\^)?\001(\d+)/o) {
+	    debug("$key has too many hits", 2 ) if DEBUG;
+	    $too_many_hits += $2;
+	    if ($1) { # use the empty prefix
+		foreach my $k (@$keywords) {
+		    next unless $key =~ /\Q$k\E/;
+		}
+		debug("add key $key", 2) if DEBUG;
+		$pkgs{$key}++;
+		$pkgs_min{$key}++;
+	    }
 	} else {
 	  PREFIX:
 	    foreach (split /\000/o, $prefixes) {
@@ -180,18 +188,21 @@ sub do_names_search {
 		foreach my $k (@$keywords) {
 		    next PREFIX unless $word =~ /\Q$k\E/;
 		}
-		debug( "add word $word", 2) if DEBUG;
+		debug("add word $word", 2) if DEBUG;
 		$pkgs{$word}++;
+		$pkgs_min{$word}++ if $_ eq '';
 	    }
 	}
 	last if $postfixes->seq( $key, $prefixes, R_NEXT ) != 0;
-	last if $too_many_hits or keys %pkgs >= 100;
+	last if keys %pkgs_min >= 100;
     }
-    
-    my $no_results = keys %pkgs;
-    if ($too_many_hits || ($no_results >= 100)) {
-	$too_many_hits += $no_results;
-	%pkgs = ( $first_keyword => 1 ) unless @$keywords;
+
+    my $nr = keys %pkgs;
+    my $min_nr = keys %pkgs_min;
+    debug("nr=$nr min_nr=$min_nr too_many_hits=$too_many_hits", 1) if DEBUG;
+    if ($nr >= 100) {
+	$too_many_hits += $nr - $min_nr + 1;
+	%pkgs = %pkgs_min;
     }
     foreach my $pkg (sort keys %pkgs) {
 	&$read_entry( $packages, $pkg, $results, $non_results, $opts );
-- 
2.39.2