X-Git-Url: https://git.deb.at/?p=deb%2Fpackages.git;a=blobdiff_plain;f=bin%2Fparse-packages;h=64a00cdf125d897e2d039f1746dffc94ab43f15b;hp=e441515c8708bbfcb59939630c611c69a07caab8;hb=d02a2656068707e6928642c632f553c8c8770ea9;hpb=a3e6aa1e34000d0a7faa1773bd6bc32c32cfa625 diff --git a/bin/parse-packages b/bin/parse-packages index e441515..64a00cd 100755 --- a/bin/parse-packages +++ b/bin/parse-packages @@ -31,7 +31,11 @@ my $MAX_PACKAGE_POSTFIXES = 100; use DB_File; use Storable; +use File::Path; +use Digest::MD5; use Deb::Versions; +use Lingua::Stem v0.82; +use Search::Xapian; use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES ); &Packages::Config::init( './' ); my %packages_small = (); @@ -50,15 +54,24 @@ my %priorities = (); $/ = ""; -for my $archive (@ARCHIVES) { - for my $suite (@SUITES) { +-d $DBDIR || mkpath( $DBDIR ); +-d "$DBDIR/xapian.new" && rmtree("$DBDIR/xapian.new"); +-d "$DBDIR/xapian.old" && rmtree("$DBDIR/xapian.old"); +mkpath( "$DBDIR/xapian.new" ); - my %package_names_suite = (); +for my $suite (@SUITES) { + my %package_names_suite = (); + my %packages_all_db; + tie %packages_all_db, "DB_File", "$DBDIR/packages_all_$suite.db.new", + O_RDWR|O_CREAT, 0666, $DB_BTREE + or die "Error creating DB: $!"; + + for my $archive (@ARCHIVES) { print "Reading $archive/$suite...\n"; - my %packages_all_db; - tie %packages_all_db, "DB_File", "$DBDIR/packages_all_$suite.db.new", - O_RDWR|O_CREAT, 0666, $DB_BTREE - or die "Error creating DB: $!"; + if (!-d "$TOPDIR/archive/$archive/$suite/") { + print "\tseems not to exist, skipping...\n"; + next; + } open PKG, "zcat $TOPDIR/archive/$archive/$suite/$what/{,debian-installer/}binary-*/Packages.gz|"; while () { next if /^\s*$/; @@ -74,10 +87,15 @@ for my $archive (@ARCHIVES) { } # Skip double package next if exists($packages_all_db{"$data{'package'} $data{'architecture'} $data{'version'}"}); + # Skip arch:all for amd64 & gnuab, any non-redundancy is + # usually a bug anyway + next if ($archive eq 'amd64' or $archive eq 'gnuab') + and $data{architecture} eq 'all'; if ($data{'provides'}) { foreach (split /\s*,\s*/, $data{'provides'}) { $virtual_packages{$_}{$suite}{$data{'package'}}++; + $packages_small{$_} ||= {}; } } $package_names{$data{'package'}} = 1; @@ -86,10 +104,33 @@ for my $archive (@ARCHIVES) { my $src_version = ''; if ($data{'source'}) { $src = $data{'source'}; - $src =~ s/\s+.*//; # strip version info + $src =~ s/\s+.*//o; # strip version info } $data{'source'} = $src; - my $descr = $data{'description'}; + + # expand tags like devel::{lang:c,lang:c++} + if ($data{'tag'} && $data{'tag'} =~ /\{/) { + my @complete_tags = split(/, /, $data{'tag'}); + my @tags; + foreach (@complete_tags) { + my ($facet, $tag) = split( /::/, $_, 2); + if ($tag =~ s/^\{(.+)\}$/$1/) { + foreach (split( /,/, $tag )) { + push @tags, "${facet}::$_"; + } + } else { + push @tags, "${facet}::$tag"; + } + } + my $old = $data{tag}; + $data{'tag'} = join ", ", @tags; + } + + # we add some additional data here + my $descr = "$data{'description'}\000$data{'package'}\000" + .($data{'tag'}||''); + my $sdescr = $data{'description'}; + $sdescr =~ s/\n.*//os; my $did = undef; if (exists($descriptions{$descr})) { $did = $descriptions{$descr}; @@ -98,13 +139,12 @@ for my $archive (@ARCHIVES) { $descriptions[$did] = $descr; $descriptions{$descr} = $did; } + $data{'description-md5'} = Digest::MD5::md5_hex($data{'description'}, "\n"); $data{'description'} = $did; $packages_descriptions{"$data{'package'} $data{'version'} $data{'architecture'}"} = $did; $descriptions_packages{$did} .= "$data{'package'} $data{'version'} $data{'architecture'}\000"; - my $sdescr = $descr; - $sdescr =~ s/\n.*//s; my $section = 'main'; my $subsection = $data{section} || '-'; if ($data{section} && ($data{section} =~ m=/=o)) { @@ -115,19 +155,19 @@ for my $archive (@ARCHIVES) { $data{'section'} = $section; $data{'subsection'} = $subsection; $data{'priority'} ||= '-'; - $sections{$suite}{$archive}{$section}++; - $subsections{$suite}{$archive}{$subsection}++; - $priorities{$suite}{$archive}{$data{priority}}++; + $sections{$suite}{$section}++; + $subsections{$suite}{$subsection}++; + $priorities{$suite}{$data{priority}}++; my $pkgitem = "$archive $suite $data{'architecture'} ". "$section $subsection $data{'priority'} $data{'version'} $sdescr\0"; - my $previtem = $packages_small{$data{'package'}}{$suite}{$data{'architecture'}} - || $pkgitem; + my $previtem = ($packages_small{$data{'package'}}{$suite}{$data{'architecture'}} + ||= $pkgitem); $packages_small{$data{'package'}}{$suite}{$data{'architecture'}} = $pkgitem - if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) >= 0; - $previtem = $packages_small{$data{'package'}}{$suite}{'any'} - || $pkgitem; + if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) > 0; + $previtem = ($packages_small{$data{'package'}}{$suite}{'any'} + ||= $pkgitem); $packages_small{$data{'package'}}{$suite}{'any'} = $pkgitem - if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) >= 0; + if version_cmp($data{'version'}, (split /\s/o, $previtem)[6]) > 0; $sources_packages{$src} .= "$archive $suite $data{'package'} $data{'version'} $data{'architecture'}\000"; $data{archive} = $archive; @@ -141,16 +181,16 @@ for my $archive (@ARCHIVES) { $packages_all_db{"$data{'package'} $data{'architecture'} $data{'version'}"} = $data; } + } - open NAMES, '>>', "$DBDIR/package_names_$suite.txt.new" - or die "Error creating package names list: $!"; - foreach (sort keys %package_names_suite) { - print NAMES "$_\n"; - } - close NAMES; - - untie %packages_all_db; + open NAMES, '>', "$DBDIR/package_names_$suite.txt.new" + or die "Error creating package names list: $!"; + foreach (sort keys %package_names_suite) { + print NAMES "$_\n"; } + close NAMES; + + untie %packages_all_db; } print "Writing databases...\n"; @@ -167,7 +207,7 @@ while (my ($pkg, $v) = each(%packages_small)) { $res3 .= $v3; } } - + if (exists $virtual_packages{$pkg}) { while (my ($suite, $v2) = each %{$virtual_packages{$pkg}}) { $res1 .= "$suite\01".(join ' ', keys %$v2)."\01"; @@ -212,13 +252,24 @@ while (my ($k, $v) = each(%descriptions_packages)) { } untie %descriptions_packages_db; +my $stemmer = Lingua::Stem->new(); +$stemmer->stem_caching({ -level => 2 }); +my $xapian_db; +eval { + $xapian_db = Search::Xapian::WritableDatabase->new("$DBDIR/xapian.new", + Search::Xapian::DB_CREATE_OR_OPEN) + or die "can't create write-able db object: $!\n"; +}; +die $@ if $@; my %descriptions_db; tie %descriptions_db, "DB_File", "$DBDIR/descriptions.db.new", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Error creating DB: $!"; -open DESCR, ">", "$DBDIR/descriptions.txt" or die "Error creating descriptions textfile"; +print "Index $#descriptions descriptions\n"; for (my $i=1; $i<= $#descriptions; $i++) { my $plain_description = $descriptions[$i]; + # strip away additional data + my ($only_desc) = split /\000/o, $plain_description, 2; # WARNING: This needs to correspond with what happens in # Packages/Search.pm:do_fulltext_search $plain_description =~ tr [A-Z] [a-z]; @@ -226,11 +277,32 @@ for (my $i=1; $i<= $#descriptions; $i++) { $plain_description = " $plain_description "; $plain_description =~ s/[(),.-]+//og; $plain_description =~ s#[^a-z0-9_/+]+# #og; - print DESCR "$plain_description\n"; - $descriptions_db{$i} = $descriptions[$i]; + + #XAPIAN + eval { + my @words = split /\s+/, $plain_description; + my $stem_words = $stemmer->stem( \@words ); + my $doc = Search::Xapian::Document->new() + or die "can't create doc object for $i: $!\n"; + if ($doc->set_data($i)){ + warn "can't set_data in doc object for $i: $!\n"; + } + for my $j (0 .. (@$stem_words-1)) { + next if $stem_words->[$j] =~ /^\s*$/o; + if ($doc->add_posting($stem_words->[$j], $j)) { + warn "can't add word $stem_words->[$j] $j: $!\n"; + } + } + $xapian_db->add_document($doc) + or warn "failed to add document: $i\n"; + }; + die $@ if $@; + + $descriptions_db{$i} = $only_desc; } -close DESCR; untie %descriptions_db; +$xapian_db->flush; +undef $xapian_db; # package names stuff: for my $pkg (keys %package_names) { @@ -269,10 +341,12 @@ for my $suite (@SUITES) { rename("$DBDIR/package_names_$suite.txt.new", "$DBDIR/package_names_$suite.txt"); } +rename("$DBDIR/xapian", "$DBDIR/xapian.old"); +rename("$DBDIR/xapian.new","$DBDIR/xapian"); +rmtree("$DBDIR/xapian.old"); rename("$DBDIR/packages_descriptions.db.new", "$DBDIR/packages_descriptions.db"); rename("$DBDIR/descriptions_packages.db.new", "$DBDIR/descriptions_packages.db"); -rename("$DBDIR/descriptions.txt.new", "$DBDIR/descriptions.txt"); rename("$DBDIR/descriptions.db.new", "$DBDIR/descriptions.db"); rename("$DBDIR/package_postfixes.db.new", "$DBDIR/package_postfixes.db");