]> git.deb.at Git - deb/packages.git/blobdiff - bin/parse-packages
setup-site: Only search for files to work on two directories deep
[deb/packages.git] / bin / parse-packages
index b3aab25fff2ee63b7e779edb86a2928023cba511..fa81fd400c60676140a79c7e8cbd43b42ef87904 100755 (executable)
@@ -20,6 +20,7 @@
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 use strict;
+use warnings;
 use lib './lib';
 
 $| = 1;
@@ -30,7 +31,10 @@ my $MAX_PACKAGE_POSTFIXES = 100;
 
 use DB_File;
 use Storable;
+use File::Path;
 use Deb::Versions;
+use Lingua::Stem v0.82;
+use Search::Xapian;
 use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES );
 &Packages::Config::init( './' );
 my %packages_small = ();
@@ -49,14 +53,24 @@ my %priorities = ();
 
 $/ = "";
 
-for my $archive (@ARCHIVES) {
-    for my $suite (@SUITES) {
+-d $DBDIR || mkpath( $DBDIR );
+-d "$DBDIR/xapian.new" && rmtree("$DBDIR/xapian.new");
+-d "$DBDIR/xapian.old" && rmtree("$DBDIR/xapian.old");
+mkpath( "$DBDIR/xapian.new" );
 
+for my $suite (@SUITES) {
+    my %package_names_suite = ();
+    my %packages_all_db;
+    tie %packages_all_db, "DB_File", "$DBDIR/packages_all_$suite.db.new",
+           O_RDWR|O_CREAT, 0666, $DB_BTREE
+           or die "Error creating DB: $!";
+
+    for my $archive (@ARCHIVES) {
        print "Reading $archive/$suite...\n";
-       my %packages_all_db;
-       tie %packages_all_db, "DB_File", "$DBDIR/packages_all_$suite.db.new",
-               O_RDWR|O_CREAT, 0666, $DB_BTREE
-               or die "Error creating DB: $!";
+       if (!-d "$TOPDIR/archive/$archive/$suite/") {
+                print "\tseems not to exist, skipping...\n";
+                next;
+        }
        open PKG, "zcat $TOPDIR/archive/$archive/$suite/$what/{,debian-installer/}binary-*/Packages.gz|";
        while (<PKG>) {
                next if /^\s*$/;
@@ -72,21 +86,30 @@ for my $archive (@ARCHIVES) {
                }
                # Skip double package
                next if exists($packages_all_db{"$data{'package'} $data{'architecture'} $data{'version'}"});
+               # Skip arch:all for amd64&kfreebsd, too often broken
+               next if ($archive eq 'amd64' or $archive eq 'kfreebsd')
+                   and $data{architecture} eq 'all';
 
                if ($data{'provides'}) {
                    foreach (split /\s*,\s*/, $data{'provides'}) {
                        $virtual_packages{$_}{$suite}{$data{'package'}}++;
+                       $packages_small{$_} ||= {};
                    }
                }
                $package_names{$data{'package'}} = 1;
+               $package_names_suite{$data{'package'}} = 1;
                my $src = $data{'package'};
                my $src_version = '';
                if ($data{'source'}) {
                        $src = $data{'source'};
-                       $src =~ s/\s+.*//; # strip version info
+                       $src =~ s/\s+.*//o; # strip version info
                }
                $data{'source'} = $src;
-               my $descr = $data{'description'};
+               # we add some additional data here
+               my $descr = "$data{'description'}\000$data{'package'}\000"
+                   .($data{'tag'}||'');
+               my $sdescr = $data{'description'};
+               $sdescr =~ s/\n.*//os;
                my $did = undef;
                if (exists($descriptions{$descr})) {
                        $did  = $descriptions{$descr};
@@ -100,8 +123,6 @@ for my $archive (@ARCHIVES) {
                $descriptions_packages{$did} .=
                        "$data{'package'} $data{'version'} $data{'architecture'}\000";
 
-               my $sdescr = $descr;
-               $sdescr =~ s/\n.*//s;
                my $section = 'main';
                my $subsection = $data{section} || '-';
                if ($data{section} && ($data{section} =~ m=/=o)) {
@@ -112,19 +133,19 @@ for my $archive (@ARCHIVES) {
                $data{'section'} = $section;
                $data{'subsection'} = $subsection;
                $data{'priority'} ||= '-';
-               $sections{$suite}{$archive}{$section}++;
-               $subsections{$suite}{$archive}{$subsection}++;
-               $priorities{$suite}{$archive}{$data{priority}}++;
+               $sections{$suite}{$section}++;
+               $subsections{$suite}{$subsection}++;
+               $priorities{$suite}{$data{priority}}++;
                my $pkgitem = "$archive $suite $data{'architecture'} ".
                        "$section $subsection $data{'priority'} $data{'version'} $sdescr\0";
-               my $previtem = $packages_small{$data{'package'}}{$suite}{$data{'architecture'}}
-                   || $pkgitem;
+               my $previtem = ($packages_small{$data{'package'}}{$suite}{$data{'architecture'}}
+                   ||= $pkgitem);
                $packages_small{$data{'package'}}{$suite}{$data{'architecture'}} = $pkgitem
-                   if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) >= 0;
-               $previtem = $packages_small{$data{'package'}}{$suite}{'any'}
-                   || $pkgitem;
+                   if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) > 0;
+               $previtem = ($packages_small{$data{'package'}}{$suite}{'any'}
+                   ||= $pkgitem);
                $packages_small{$data{'package'}}{$suite}{'any'} = $pkgitem
-                   if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) >= 0;
+                   if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) > 0;
                $sources_packages{$src} .=
                        "$archive $suite $data{'package'} $data{'version'} $data{'architecture'}\000";
                $data{archive} = $archive;
@@ -138,9 +159,16 @@ for my $archive (@ARCHIVES) {
                $packages_all_db{"$data{'package'} $data{'architecture'} $data{'version'}"}
                        = $data;
        }
+    }
 
-       untie %packages_all_db;
+    open NAMES, '>', "$DBDIR/package_names_$suite.txt.new"
+       or die "Error creating package names list: $!";
+    foreach (sort keys %package_names_suite) {
+       print NAMES "$_\n";
     }
+    close NAMES;
+
+    untie %packages_all_db;
 }
 
 print "Writing databases...\n";
@@ -157,7 +185,7 @@ while (my ($pkg, $v) = each(%packages_small)) {
                $res3 .= $v3;
            }
        }
-       
+
        if (exists $virtual_packages{$pkg}) {
                while (my ($suite, $v2) = each %{$virtual_packages{$pkg}}) {
                        $res1 .= "$suite\01".(join ' ', keys %$v2)."\01";
@@ -202,13 +230,25 @@ while (my ($k, $v) = each(%descriptions_packages)) {
 }
 untie %descriptions_packages_db;
 
+my $stemmer = Lingua::Stem->new();
+$stemmer->stem_caching({ -level => 2 });
+my $xapian_db;
+eval {
+    $xapian_db = Search::Xapian::WritableDatabase->new("$DBDIR/xapian.new",
+                                                      Search::Xapian::DB_CREATE_OR_OPEN)
+       or die "can't create write-able db object: $!\n";
+};
+die $@ if $@;
 my %descriptions_db;
 tie %descriptions_db, "DB_File", "$DBDIR/descriptions.db.new",
        O_RDWR|O_CREAT, 0666, $DB_BTREE
        or die "Error creating DB: $!";
 open DESCR, ">", "$DBDIR/descriptions.txt" or die "Error creating descriptions textfile";
+print "Index $#descriptions descriptions\n";
 for (my $i=1; $i<= $#descriptions; $i++) {
        my $plain_description = $descriptions[$i];
+       # strip away additional data
+       my ($only_desc) = split /\000/o, $plain_description, 2;
 # WARNING: This needs to correspond with what happens in
 # Packages/Search.pm:do_fulltext_search
        $plain_description =~ tr [A-Z] [a-z];
@@ -217,10 +257,33 @@ for (my $i=1; $i<= $#descriptions; $i++) {
        $plain_description =~ s/[(),.-]+//og;
        $plain_description =~ s#[^a-z0-9_/+]+# #og;
        print DESCR "$plain_description\n";
-       $descriptions_db{$i} = $descriptions[$i];
+
+       #XAPIAN
+       eval {
+           my @words = split /\s+/, $plain_description;
+           my $stem_words = $stemmer->stem( \@words );
+           my $doc = Search::Xapian::Document->new()
+               or die "can't create doc object for $i: $!\n";
+           if ($doc->set_data($i)){
+               warn "can't set_data in doc object for $i: $!\n";
+           }
+           for my $j (0 .. (@$stem_words-1)) {
+               next if $stem_words->[$j] =~ /^\s*$/o;
+               if ($doc->add_posting($stem_words->[$j], $j)) {
+                   warn "can't add word $stem_words->[$j] $j: $!\n";
+               }
+           }
+           $xapian_db->add_document($doc)
+               or warn "failed to add document: $i\n";
+       };
+       die $@ if $@;
+
+       $descriptions_db{$i} = $only_desc;
 }
 close DESCR;
 untie %descriptions_db;
+$xapian_db->flush;
+undef $xapian_db;
 
 # package names stuff:
 for my $pkg (keys %package_names) {
@@ -256,7 +319,12 @@ rename("$DBDIR/sources_packages.db.new", "$DBDIR/sources_packages.db");
 for my $suite (@SUITES) {
        rename("$DBDIR/packages_all_$suite.db.new",
               "$DBDIR/packages_all_$suite.db");
+       rename("$DBDIR/package_names_$suite.txt.new",
+              "$DBDIR/package_names_$suite.txt");
 }
+rename("$DBDIR/xapian", "$DBDIR/xapian.old");
+rename("$DBDIR/xapian.new","$DBDIR/xapian");
+rmtree("$DBDIR/xapian.old");
 rename("$DBDIR/packages_descriptions.db.new",
        "$DBDIR/packages_descriptions.db");
 rename("$DBDIR/descriptions_packages.db.new",