# Convert Contents.gz files into Sleepycat db files for efficient usage of
# data
#
-# $Id$
-#
# Copyright (C) 2006 Jeroen van Wolffelaar <jeroen@wolffelaar.nl>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
use strict;
+use warnings;
use lib './lib';
+$| = 1;
+
# Important, we want sorting and such to happen like in the C locale: binary,
# without any fancy collation. FIXME: is this actually adequate?
$ENV{"LC_ALL"} = 'C';
# temporary files, and is adviced when possible
my $SORT_REVERSE_CONCURRENTLY = 1;
+use English;
use DB_File;
use Storable;
+use File::Path;
+use File::Basename;
+use Packages::CommonCode qw(:all);
use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES @ARCHITECTURES );
&Packages::Config::init( './' );
-my @archives =( 'us'); #@ARCHIVES # NOT-IMPLEMENTED-YET
+my @archives = @ARCHIVES;
my @suites = @SUITES;
my @archs = @ARCHITECTURES;
$DBDIR .= "/contents";
+mkdirp( $DBDIR );
-for my $archive (@archives) { for my $suite (@suites) {
+for my $suite (@suites) {
+ for my $arch (@archs) {
- for my $arch (@archs) {
-
- my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz";
my $filelist_db = "$DBDIR/filelists_${suite}_${arch}.db";
- next unless -f $filename;
- my $ftime = (stat $filename)[10]; # Note: ctime, because mtime is set back via rsync
my $dbtime = (stat $filelist_db)[9];
- next if defined $dbtime and $dbtime > $ftime;
- print "Reading $archive/$suite/$arch...\n";
-
my %packages_contents = ();
my %packages_contents_nr = ();
my %packages_contents_lastword = ();
$extra = "|sort" if $SORT_REVERSE_CONCURRENTLY;
open REVERSED, "$extra>$DBDIR/reverse.tmp"
- or die "Failed to open output reverse file";
-
- open CONT, "zcat $filename|$what";
- while (<CONT>) {last if /^FILE/mo;}
- while (<CONT>) {
- my $data = "";
- my %data = ();
- chomp;
- print "Doing line ".($./1000)."k (out of approx 1.5M)\n" if $. % 250000 == 0;
- /^(.+?)\s+(\S+)$/o;
- my ($file, $value) = ($1, $2);
- $value =~ s#[^,/]+/##og;
- my @packages = split /,/, $value;
- for (@packages) {
+ or die "Failed to open output reverse file: $!";
+
+ my $changed = 0;
+ for my $archive (@archives) {
+
+ my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz";
+ next unless -f $filename;
+ # Note: ctime, because mtime is set back via rsync
+ my $ftime = (stat $filename)[10];
+ next if defined $dbtime and $dbtime > $ftime;
+ print "$archive/$suite/$arch needs update\n";
+ $changed++;
+ }
+ if ($changed) {
+ for my $archive (@archives) {
+
+ my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz";
+ next unless -f $filename;
+ print "Reading $archive/$suite/$arch...\n";
+
+ open CONT, "zcat $filename|$what"
+ or die $!;
+ my $header_found = 0;
+ while (<CONT>) { /^FILE/mo && do { $header_found = 1; last };}
+ if (eof(CONT)) { # no header found or only header found
+ close CONT; # explicit close to reset $.
+ next if $header_found;
+ open CONT, "zcat $filename|$what";
+ }
+ while (<CONT>) {
+ my $data = "";
+ my %data = ();
+ chomp;
+ print "Doing line ".($NR/1000)."k (out of approx 2.0M)\n"
+ if $NR % 250000 == 0;
+ /^(.+?)\s+(\S+)$/o;
+ my ($file, $value) = ($1, $2);
+ $value =~ s#[^,/]+/##og;
+ my @packages = split m/,/, $value;
+ for (@packages) {
$packages_contents_nr{$_}++;
my $lw = $packages_contents_lastword{$_} || "\0";
my $i=0;
$i = 255 if $i > 255;
$packages_contents{$_} .= pack "CC/a*", ($i, substr($file, $i));
$packages_contents_lastword{$_} = "$file\0";
+ }
+ # Searches are case-insensitive
+ (my $nocase = $file) =~ tr [A-Z] [a-z];
+ my $case = ($nocase eq $file) ? '-' : $file;
+
+ print REVERSED (reverse $nocase)."\0".$case."\0".
+ (join ":$arch\0", @packages).":$arch\n";
}
- # Searches are case-insensitive
- $file =~ tr [A-Z] [a-z];
+ close CONT;
- print REVERSED (reverse $file)."\0".(join ":$arch\0", @packages).":$arch\n";
- }
- close CONT;
- close REVERSED;
+ }
+ close REVERSED;
- print "Sorting reverse list if needed\n";
- system("cd $DBDIR && sort reverse.tmp > reverse.sorted && mv reverse.{sorted,tmp}") == 0
- or die "Failed to sort reverse"
- unless $SORT_REVERSE_CONCURRENTLY;
+ print "Sorting reverse list if needed\n";
+ system("cd $DBDIR && sort reverse.tmp > reverse.sorted &&".
+ " mv reverse.{sorted,tmp}") == 0
+ or die "Failed to sort reverse"
+ unless $SORT_REVERSE_CONCURRENTLY;
- print "Writing filelist db\n";
- tie my %packages_contents_db, "DB_File", "$filelist_db.new",
- O_RDWR|O_CREAT, 0666, $DB_BTREE
+ print "Writing filelist db\n";
+ tie my %packages_contents_db, "DB_File", "$filelist_db.new",
+ O_RDWR|O_CREAT, 0666, $DB_BTREE
or die "Error creating DB: $!";
- while (my ($k, $v) = each(%packages_contents)) {
+ while (my ($k, $v) = each(%packages_contents)) {
$packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k})
- . $v;
+ . $v;
+ }
+ untie %packages_contents_db;
+
+ rename("$DBDIR/reverse.tmp", "$DBDIR/reverse_${suite}_${arch}.txt");
+
+ activate($filelist_db);
+ #FIXME: hardcoded archs. (debports has no contrib/non-free)
+ if ($arch !~ m/^kfreebsd-.*$/) {
+ system("ln", "-sf", basename($filelist_db),
+ "$DBDIR/filelists_${suite}_all.db") == 0
+ or die "Oops";
+ }
}
- untie %packages_contents_db;
-
- rename("$DBDIR/reverse.tmp", "$DBDIR/reverse_${suite}_${arch}.txt");
+ }
- rename("$filelist_db.new", $filelist_db);
- system("ln -sf $filelist_db $DBDIR/filelists_${suite}_all.db") == 0
- or die "Oops";
- }
+ my $go = 0;
+ my $suite_mtime = (stat "$DBDIR/reverse_$suite.db")[9];
+ for my $file (glob "$DBDIR/reverse_${suite}_*.txt") {
+ $go = 1 if not defined $suite_mtime
+ or $suite_mtime < (stat $file)[9];
+ }
+ next unless $go;
- print "Merging reverse path lists for ${suite}...\n";
+ print "Merging reverse path lists for ${suite}...\n";
- open MERGED, "sort -m $DBDIR/reverse_${suite}_*.txt |"
+ open MERGED, "-|", "sort -m $DBDIR/reverse_${suite}_*.txt"
or die "Failed to open merged list";
- open FILENAMES, "> $DBDIR/filenames_$suite.txt.new"
+ open FILENAMES, ">", "$DBDIR/filenames_$suite.txt.new"
or die "Failed to open filenames list";
- tie my %reverse_path_db, "DB_File", "$DBDIR/reverse_${suite}.db.new",
- O_RDWR|O_CREAT, 0666, $DB_BTREE
- or die "Error creating DB: $!";
-
- my $lastpath = "";
- my $lastfile = "";
- my @matches = ();
- while (<MERGED>) {
- print "Doing line ".($./1000000)."M (out of approx. 16M)\n" if $. % 1000000 == 0;
+ tie my %reverse_path_db, "DB_File", "$DBDIR/reverse_${suite}.db.new",
+ O_RDWR|O_CREAT, 0666, $DB_BTREE
+ or die "Error creating DB: $!";
+
+ my $lastpath = my $lastcasepath = my $lastfile = "";
+ my %matches = ();
+ while (<MERGED>) {
+ print "Doing line ".($NR/1000000)."M (out of approx. 20M)\n"
+ if $NR % 1000000 == 0;
chomp;
- my @line = split /\0/o, $_;
+ my @line = split m/\0/o, $_;
my $revpath = shift @line;
+ my $casepath = shift @line;
if ($revpath ne $lastpath) {
- # Wrap: Do useful stuff with this ($lastpath, @matches)
- $reverse_path_db{$lastpath} = join "\0", @matches if $lastpath ne "";
- $lastpath =~ s,/.*,,o;
- if ($lastfile ne $lastpath) {
+ # Wrap: Do useful stuff with this ($lastpath, @matches)
+ if ($lastpath ne "") {
+ my @matches;
+ while (my ($k, $v) = each %matches) {
+ push @matches, join("\0", $k, @$v);
+ }
+ $reverse_path_db{$lastpath} = join "\1", @matches;
+ %matches = ();
+ }
+ $lastpath =~ s,/.*,,o;
+ if ($lastfile ne $lastpath) {
$lastfile = $lastpath;
print FILENAMES (reverse $lastfile)."\n";
- }
- #
- $lastpath = $revpath;
- @matches = @line;
- next;
+ }
+ #
+ $lastpath = $revpath;
+ $lastcasepath = $casepath;
+ $matches{$casepath} = \@line;
+ next;
+# } elsif ($lastcasepath ne "" and $casepath ne $lastcasepath) {
+# warn reverse($revpath)." has more than one casepath: $casepath $lastcasepath\n";
}
- push @matches, @line
- }
- # Note: do useful stuff here too, for out last entry. Maybe prevent this by
- # adding a fake ultimate entry?
- $reverse_path_db{$lastpath} = join "\0", @matches;
-
- untie %reverse_path_db;
- close FILENAMES;
- close MERGED;
-
- rename "$DBDIR/filenames_$suite.txt.new", "$DBDIR/filenames_$suite.txt";
- rename "$DBDIR/reverse_$suite.db.new", "$DBDIR/reverse_$suite.db";
-}}
-
-#print "Writing databases...\n";
-
-# FIXME: missing filenames due to optimising above. Need to store filenames
-# per-suite/arch, but merge them in the end for better cached searching
-#open FILENAMES, "> $DBDIR/filenames.txt.new";
-#for (keys %filenames) {
-# print FILENAMES "$_\n";
-#}
-#close FILENAMES;
-
-#rename("$DBDIR/filenames.txt.new", "$DBDIR/filenames.txt");
+ push @{$matches{$casepath}}, @line;
+ }
+ # Note: do useful stuff here too, for out last entry. Maybe prevent this by
+ # adding a fake ultimate entry?
+ {
+ my @matches;
+ while (my ($k, $v) = each %matches) {
+ push @matches, join("\0", $k, @$v);
+ }
+ $reverse_path_db{$lastpath} = join "\1", @matches;
+ }
+
+ untie %reverse_path_db;
+ close FILENAMES;
+ close MERGED;
+
+ activate("$DBDIR/filenames_$suite.txt");
+ activate("$DBDIR/reverse_$suite.db");
+}
# vim: set ts=4