From: Frank Lichtenheld Date: Sat, 14 Apr 2007 00:18:37 +0000 (+0200) Subject: Description Search: Move to Xapian X-Git-Url: https://git.deb.at/w?a=commitdiff_plain;h=bf6d5b1c3221cdd54d613778ce806804a3faf006;p=deb%2Fpackages.git Description Search: Move to Xapian Use Xapian as Backend for fulltext search in descriptions. Introduces new do_xapian_search function to be able to switch to the old do_fulltext_search in case of problems. --- diff --git a/bin/parse-packages b/bin/parse-packages index 189ea02..fa81fd4 100755 --- a/bin/parse-packages +++ b/bin/parse-packages @@ -33,6 +33,8 @@ use DB_File; use Storable; use File::Path; use Deb::Versions; +use Lingua::Stem v0.82; +use Search::Xapian; use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES ); &Packages::Config::init( './' ); my %packages_small = (); @@ -52,6 +54,9 @@ my %priorities = (); $/ = ""; -d $DBDIR || mkpath( $DBDIR ); +-d "$DBDIR/xapian.new" && rmtree("$DBDIR/xapian.new"); +-d "$DBDIR/xapian.old" && rmtree("$DBDIR/xapian.old"); +mkpath( "$DBDIR/xapian.new" ); for my $suite (@SUITES) { my %package_names_suite = (); @@ -97,10 +102,14 @@ for my $suite (@SUITES) { my $src_version = ''; if ($data{'source'}) { $src = $data{'source'}; - $src =~ s/\s+.*//; # strip version info + $src =~ s/\s+.*//o; # strip version info } $data{'source'} = $src; - my $descr = $data{'description'}; + # we add some additional data here + my $descr = "$data{'description'}\000$data{'package'}\000" + .($data{'tag'}||''); + my $sdescr = $data{'description'}; + $sdescr =~ s/\n.*//os; my $did = undef; if (exists($descriptions{$descr})) { $did = $descriptions{$descr}; @@ -114,8 +123,6 @@ for my $suite (@SUITES) { $descriptions_packages{$did} .= "$data{'package'} $data{'version'} $data{'architecture'}\000"; - my $sdescr = $descr; - $sdescr =~ s/\n.*//s; my $section = 'main'; my $subsection = $data{section} || '-'; if ($data{section} && ($data{section} =~ m=/=o)) { @@ -178,7 +185,7 @@ while (my ($pkg, $v) = each(%packages_small)) { 
$res3 .= $v3; } } - + if (exists $virtual_packages{$pkg}) { while (my ($suite, $v2) = each %{$virtual_packages{$pkg}}) { $res1 .= "$suite\01".(join ' ', keys %$v2)."\01"; @@ -223,13 +230,25 @@ while (my ($k, $v) = each(%descriptions_packages)) { } untie %descriptions_packages_db; +my $stemmer = Lingua::Stem->new(); +$stemmer->stem_caching({ -level => 2 }); +my $xapian_db; +eval { + $xapian_db = Search::Xapian::WritableDatabase->new("$DBDIR/xapian.new", + Search::Xapian::DB_CREATE_OR_OPEN) + or die "can't create write-able db object: $!\n"; +}; +die $@ if $@; my %descriptions_db; tie %descriptions_db, "DB_File", "$DBDIR/descriptions.db.new", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Error creating DB: $!"; open DESCR, ">", "$DBDIR/descriptions.txt" or die "Error creating descriptions textfile"; +print "Index $#descriptions descriptions\n"; for (my $i=1; $i<= $#descriptions; $i++) { my $plain_description = $descriptions[$i]; + # strip away additional data + my ($only_desc) = split /\000/o, $plain_description, 2; # WARNING: This needs to correspond with what happens in # Packages/Search.pm:do_fulltext_search $plain_description =~ tr [A-Z] [a-z]; @@ -238,10 +257,33 @@ for (my $i=1; $i<= $#descriptions; $i++) { $plain_description =~ s/[(),.-]+//og; $plain_description =~ s#[^a-z0-9_/+]+# #og; print DESCR "$plain_description\n"; - $descriptions_db{$i} = $descriptions[$i]; + + #XAPIAN + eval { + my @words = split /\s+/, $plain_description; + my $stem_words = $stemmer->stem( \@words ); + my $doc = Search::Xapian::Document->new() + or die "can't create doc object for $i: $!\n"; + if ($doc->set_data($i)){ + warn "can't set_data in doc object for $i: $!\n"; + } + for my $j (0 .. 
(@$stem_words-1)) { + next if $stem_words->[$j] =~ /^\s*$/o; + if ($doc->add_posting($stem_words->[$j], $j)) { + warn "can't add word $stem_words->[$j] $j: $!\n"; + } + } + $xapian_db->add_document($doc) + or warn "failed to add document: $i\n"; + }; + die $@ if $@; + + $descriptions_db{$i} = $only_desc; } close DESCR; untie %descriptions_db; +$xapian_db->flush; +undef $xapian_db; # package names stuff: for my $pkg (keys %package_names) { @@ -280,6 +322,9 @@ for my $suite (@SUITES) { rename("$DBDIR/package_names_$suite.txt.new", "$DBDIR/package_names_$suite.txt"); } +rename("$DBDIR/xapian", "$DBDIR/xapian.old"); +rename("$DBDIR/xapian.new","$DBDIR/xapian"); +rmtree("$DBDIR/xapian.old"); rename("$DBDIR/packages_descriptions.db.new", "$DBDIR/packages_descriptions.db"); rename("$DBDIR/descriptions_packages.db.new", diff --git a/lib/Packages/DoSearch.pm b/lib/Packages/DoSearch.pm index 2603ff9..a94dd7b 100644 --- a/lib/Packages/DoSearch.pm +++ b/lib/Packages/DoSearch.pm @@ -53,10 +53,22 @@ sub do_search { do_names_search( [ @keywords ], \%packages, $p_obj, \&read_entry_all, $opts, \@results, \@non_results ); - do_fulltext_search( [ @keywords ], "$DBDIR/descriptions.txt", +# my $fts0 = new Benchmark; +# do_fulltext_search( [ @keywords ], "$DBDIR/descriptions.txt", +# \%did2pkg, \%packages, +# \&read_entry_all, $opts, +# \@results, \@non_results ); + my $fts1 = new Benchmark; + do_xapian_search( [ @keywords ], "$DBDIR/xapian/", \%did2pkg, \%packages, \&read_entry_all, $opts, \@results, \@non_results ); + my $fts2 = new Benchmark; +# my $fts_grep = timediff($fts1,$fts0); + my $fts_xapian = timediff($fts2,$fts1); +# debug( "Fulltext search took ".timestr($fts_grep)." (grep)" ) if DEBUG; + debug( "Fulltext search took ".timestr($fts_xapian)." 
(Xapian)" ) + if DEBUG; } } diff --git a/lib/Packages/Search.pm b/lib/Packages/Search.pm index c6a521a..1c36986 100644 --- a/lib/Packages/Search.pm +++ b/lib/Packages/Search.pm @@ -46,6 +46,8 @@ use warnings; use POSIX; use HTML::Entities; use DB_File; +use Lingua::Stem v0.82; +use Search::Xapian qw(:ops); use Deb::Versions; use Packages::CGI; @@ -55,7 +57,7 @@ our @ISA = qw( Exporter ); our @EXPORT_OK = qw( read_entry read_entry_all read_entry_simple read_src_entry read_src_entry_all find_binaries - do_names_search do_fulltext_search + do_names_search do_fulltext_search do_xapian_search ); our %EXPORT_TAGS = ( all => [ @EXPORT_OK ] ); @@ -242,6 +244,81 @@ sub do_fulltext_search { } }
+# Full-text search in package descriptions, using the Xapian index that
+# bin/parse-packages builds.
+#
+# Arguments:
+#   $keywords    array ref of search words (left unmodified)
+#   $db          path of the Xapian database directory
+#   $did2pkg     hash ref: description id => \000-separated records whose
+#                first whitespace-delimited field is a package name
+#   $read_entry  code ref, called once per distinct matching package as
+#                $read_entry->($packages, $pkg, $results, $non_results, $opts)
+#   $packages, $opts, $results, $non_results
+#                only handed through to $read_entry
+#
+# Increments $too_many_hits when more than 100 distinct packages matched.
+sub do_xapian_search {
+    my ($keywords, $db, $did2pkg, $packages, $read_entry, $opts,
+        $results, $non_results) = @_;
+
+    my $max_hits = 100;
+
+    # NOTE: this needs to correspond with parse-packages!
+    # The indexer lower-cases a description, strips punctuation, splits it
+    # on whitespace and stems every non-blank word, so indexed terms never
+    # contain blanks.  Tokenise the keywords the same way.  Keywords are
+    # therefore not padded with blanks for $opts->{exact}: a term with
+    # embedded blanks cannot equal any indexed term.
+    my @words;
+    foreach my $keyword (@$keywords) {
+        (my $word = $keyword) =~ tr/A-Z/a-z/;   # copy; caller's list stays intact
+        $word =~ s/[(),.-]+//g;
+        $word =~ s;[^a-z0-9_/+]+; ;g;
+        push @words, grep { length } split /\s+/, $word;
+    }
+    return unless @words;    # nothing searchable was entered
+
+    my $stemmer = Lingua::Stem->new();
+    # parse-packages skips blank stems when indexing; drop them here too.
+    my @terms = grep { !/^\s*$/ } @{ $stemmer->stem( @words ) };
+    return unless @terms;
+
+    # Separate name for the handle: re-declaring "my $db" would mask the
+    # path argument and raise a "masks earlier declaration" warning.
+    my $xapian_db = Search::Xapian::Database->new( $db );
+    my $enq = $xapian_db->enquire( OP_AND, @terms );
+    debug( "Xapian Query was: ".$enq->get_query()->get_description(), 1) if DEBUG;
+    my @matches = $enq->matches(0, $max_hits);
+
+    my $numres = 0;
+    my %tmp_results;
+    foreach my $match ( @matches ) {
+        # parse-packages stores the description id as the document data
+        # (set_data($i)); read it back instead of relying on Xapian's own
+        # document ids being numbered identically.
+        my $id = $match->get_document()->get_data();
+        my $result = $did2pkg->{$id};
+        next unless defined $result;    # index and $did2pkg out of sync
+
+        foreach my $entry (split /\000/, $result) {
+            my @data = split /\s/, $entry, 3;
+            next unless defined $data[0] && length $data[0];
+#           debug ("Considering $data[0], arch = $data[2]", 3) if DEBUG;
+#           next unless $data[2] eq 'all' || $opts->{h_archs}{$data[2]};
+#           debug ("Ok", 3) if DEBUG;
+            $numres++ unless $tmp_results{$data[0]}++;
+        }
+        last if $numres > $max_hits;
+    }
+    undef $xapian_db;
+    $too_many_hits++ if $numres > $max_hits;
+
+    foreach my $pkg (keys %tmp_results) {
+        $read_entry->( $packages, $pkg, $results, $non_results, $opts );
+    }
+}
+
 sub find_binaries { my ($pkg, $archive, $suite, $src2bin) = @_;