I will definitely stick with xapian for now.
| key: "packagename version arch"
| value: a unique description id, did
-| descriptions.txt:
-| on each line:
-| description with strange characters mangled for proper substring
-| searching, linenumber being the did
-
| descriptions.db:
| key: did
| value: description, first line being short, the rest being long [no
matches" and abce: 90 matches"
- fulltext search:
- - Max 100 results
- - Better exact=1 performance by indexing per word?
- - drop case-sensitive from options, descriptions.txt all lowercase and without
- punctuation, such that instead of =~ //, indexof can be used
- in results, show full descriptions, so one sees what's being matched?
- backend:
tie %descriptions_db, "DB_File", "$DBDIR/descriptions.db.new",
O_RDWR|O_CREAT, 0666, $DB_BTREE
or die "Error creating DB: $!";
-open DESCR, ">", "$DBDIR/descriptions.txt" or die "Error creating descriptions textfile";
print "Index $#descriptions descriptions\n";
for (my $i=1; $i<= $#descriptions; $i++) {
my $plain_description = $descriptions[$i];
$plain_description = " $plain_description ";
$plain_description =~ s/[(),.-]+//og;
$plain_description =~ s#[^a-z0-9_/+]+# #og;
- print DESCR "$plain_description\n";
#XAPIAN
eval {
$descriptions_db{$i} = $only_desc;
}
-close DESCR;
untie %descriptions_db;
$xapian_db->flush;
undef $xapian_db;
"$DBDIR/packages_descriptions.db");
rename("$DBDIR/descriptions_packages.db.new",
"$DBDIR/descriptions_packages.db");
-rename("$DBDIR/descriptions.txt.new", "$DBDIR/descriptions.txt");
rename("$DBDIR/descriptions.db.new", "$DBDIR/descriptions.db");
rename("$DBDIR/package_postfixes.db.new", "$DBDIR/package_postfixes.db");
do_names_search( [ @keywords ], \%packages, $p_obj,
\&read_entry_all, $opts,
\@results, \@non_results );
-# my $fts0 = new Benchmark;
-# do_fulltext_search( [ @keywords ], "$DBDIR/descriptions.txt",
-# \%did2pkg, \%packages,
-# \&read_entry_all, $opts,
-# \@results, \@non_results );
my $fts1 = new Benchmark;
do_xapian_search( [ @keywords ], "$DBDIR/xapian/",
\%did2pkg, \%packages,
\&read_entry_all, $opts,
\@results, \@non_results );
my $fts2 = new Benchmark;
-# my $fts_grep = timediff($fts1,$fts0);
my $fts_xapian = timediff($fts2,$fts1);
-# debug( "Fulltext search took ".timestr($fts_grep)." (grep)" ) if DEBUG;
- debug( "Fulltext search took ".timestr($fts_xapian)." (Xapian)" )
+ debug( "Fulltext search took ".timestr($fts_xapian) )
if DEBUG;
}
}
&$read_entry( $packages, $pkg, $results, $non_results, $opts );
}
}
-sub do_fulltext_search {
- my ($keywords, $file, $did2pkg, $packages, $read_entry, $opts,
- $results, $non_results) = @_;
-
-# NOTE: this needs to correspond with parse-packages!
- my @tmp;
- foreach my $keyword (@$keywords) {
- $keyword =~ tr [A-Z] [a-z];
- if ($opts->{exact}) {
- $keyword = " $keyword ";
- }
- $keyword =~ s/[(),.-]+//og;
- $keyword =~ s;[^a-z0-9_/+]+; ;og;
- push @tmp, $keyword;
- }
- my $first_keyword = shift @tmp;
- @$keywords = @tmp;
-
- my $numres = 0;
- my %tmp_results;
- # fgrep is seriously faster than using perl
- open DESC, '-|', 'fgrep', '-n', '--', $first_keyword, $file
- or die "couldn't open $file: $!";
- LINE:
- while (<DESC>) {
- foreach my $k (@$keywords) {
- next LINE unless /\Q$k\E/;
- }
- /^(\d+)/;
- my $nr = $1;
- debug( "Matched line $_", 2) if DEBUG;
- my $result = $did2pkg->{$nr};
- foreach (split /\000/o, $result) {
- my @data = split /\s/, $_, 3;
-# debug ("Considering $data[0], arch = $data[2]", 3) if DEBUG;
-# next unless $data[2] eq 'all' || $opts->{h_archs}{$data[2]};
-# debug ("Ok", 3) if DEBUG;
- $numres++ unless $tmp_results{$data[0]}++;
- }
- last if $numres > 100;
- }
- close DESC;
- $too_many_hits++ if $numres > 100;
-
- my @results;
- foreach my $pkg (keys %tmp_results) {
- &$read_entry( $packages, $pkg, $results, $non_results, $opts );
- }
- }
sub do_xapian_search {
my ($keywords, $db, $did2pkg, $packages, $read_entry, $opts,