X-Git-Url: https://git.deb.at/?p=deb%2Fpackages.git;a=blobdiff_plain;f=bin%2Fparse-contents;h=11d076f49f2799a1f17c747abc5e72e4792bf63f;hp=ceb0cb0e5bce0eea3b958b93d6e5dfc4bc02a2d4;hb=0f318fa9ad9d473b543a48f46a7714a11283d300;hpb=eca3f1d1d3ddba9041b4cae9cc7c54bcb95cc3ef diff --git a/bin/parse-contents b/bin/parse-contents index ceb0cb0..11d076f 100755 --- a/bin/parse-contents +++ b/bin/parse-contents @@ -2,8 +2,6 @@ # Convert Contents.gz files into Sleepycat db files for efficient usage of # data # -# $Id$ -# # Copyright (C) 2006 Jeroen van Wolffelaar # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,40 +15,92 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. use strict; +use warnings; use lib './lib'; -my $what = $ARGV[0] ? "head -100000|" : ""; +$| = 1; + +# Important, we want sorting and such to happen like in the C locale: binary, +# without any fancy collation. FIXME: is this actually adequate? +$ENV{"LC_ALL"} = 'C'; + +my $what = $ARGV[0] ? "head -10000|" : ""; + +# More RAM vs more disk I/O tradeoff parameters, does not change +# functionality. True will always use more RAM at the benefit of less +# temporary files, and is adviced when possible +my $SORT_REVERSE_CONCURRENTLY = 1; +use English; use DB_File; use Storable; -use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES ); +use File::Path; +use File::Basename; +use Packages::CommonCode qw(:all); +use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES @ARCHITECTURES ); &Packages::Config::init( './' ); -my %packages_contents = (); -my %packages_contents_nr = (); -my %packages_contents_lastword = (); -my %file_reverse = (); - -my @archives =( 'us'); #@ARCHIVES -my @suites = ('stable');#@SUITES - -for my $archive (@archives) { for my $suite (@suites) { - - print "Reading $archive/$suite/i386...\n"; - open CONT, "zcat /org/ftp.debian.org/ftp/dists/stable/Contents-i386.gz|$what"; - while (1) {$_ = ;last if /^FILE/mo;} - while () { - my $data = ""; - my %data = (); - chomp; - print "Doing line $.\n" if $. % 10000 == 0; - /^(\S+)\s+(\S+)/; - my ($file, $value) = ($1, $2); - $value =~ s#[^,/]+/##g; - my @packages = split /,/, $value; - for (@packages) { + +my @archives = @ARCHIVES; +my @suites = @SUITES; +my @archs = @ARCHITECTURES; + +$DBDIR .= "/contents"; +mkdirp( $DBDIR ); + +for my $suite (@suites) { + for my $arch (@archs) { + + my $filelist_db = "$DBDIR/filelists_${suite}_${arch}.db"; + my $dbtime = (stat $filelist_db)[9]; + my %packages_contents = (); + my %packages_contents_nr = (); + my %packages_contents_lastword = (); + + my $extra = ""; + $extra = "|sort" if $SORT_REVERSE_CONCURRENTLY; + + open REVERSED, "$extra>$DBDIR/reverse.tmp" + or die "Failed to open output reverse file: $!"; + + my $changed = 0; + for my $archive (@archives) { + + my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz"; + next unless -f $filename; + # Note: ctime, because mtime is set back via rsync + my $ftime = (stat $filename)[10]; + next if defined $dbtime and $dbtime > $ftime; + print "$archive/$suite/$arch needs update\n"; + $changed++; + } + if ($changed) { + for my $archive (@archives) { + + my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz"; + next unless -f $filename; + print "Reading $archive/$suite/$arch...\n"; + + open CONT, "zcat $filename|$what" + or die $!; + while () { last if /^FILE/mo; } + if (eof(CONT)) { # no header found + close CONT; # explicit close to reset $. + open CONT, "zcat $filename|$what"; + } + while () { + my $data = ""; + my %data = (); + chomp; + print "Doing line ".($NR/1000)."k (out of approx 2.0M)\n" + if $NR % 250000 == 0; + /^(.+?)\s+(\S+)$/o; + my ($file, $value) = ($1, $2); + $value =~ s#[^,/]+/##og; + my @packages = split m/,/, $value; + for (@packages) { $packages_contents_nr{$_}++; my $lw = $packages_contents_lastword{$_} || "\0"; my $i=0; @@ -59,40 +109,115 @@ for my $archive (@archives) { for my $suite (@suites) { $i = 255 if $i > 255; $packages_contents{$_} .= pack "CC/a*", ($i, substr($file, $i)); $packages_contents_lastword{$_} = "$file\0"; + } + # Searches are case-insensitive + (my $nocase = $file) =~ tr [A-Z] [a-z]; + my $case = ($nocase eq $file) ? '-' : $file; + + print REVERSED (reverse $nocase)."\0".$case."\0". + (join ":$arch\0", @packages).":$arch\n"; } - # Searches are case-insensitive - $file =~ tr [A-Z] [a-z]; + close CONT; + + } + close REVERSED; + + print "Sorting reverse list if needed\n"; + system("cd $DBDIR && sort reverse.tmp > reverse.sorted &&". + " mv reverse.{sorted,tmp}") == 0 + or die "Failed to sort reverse" + unless $SORT_REVERSE_CONCURRENTLY; + + print "Writing filelist db\n"; + tie my %packages_contents_db, "DB_File", "$filelist_db.new", + O_RDWR|O_CREAT, 0666, $DB_BTREE + or die "Error creating DB: $!"; + while (my ($k, $v) = each(%packages_contents)) { + $packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k}) + . $v; + } + untie %packages_contents_db; - $file_reverse{reverse $file} = join "\0", @packages; + rename("$DBDIR/reverse.tmp", "$DBDIR/reverse_${suite}_${arch}.txt"); + + activate($filelist_db); + #FIXME: hardcoded archs. (debports has no contrib/non-free) + if ($arch ne 'avr32' and $arch ne 'm68k') { + system("ln", "-sf", basename($filelist_db), + "$DBDIR/filelists_${suite}_all.db") == 0 + or die "Oops"; + } } -}} + } -print "Writing databases...\n"; -my %packages_contents_db; -tie %packages_contents_db, "DB_File", "$DBDIR/packages_contents.db.new", - O_RDWR|O_CREAT, 0666, $DB_BTREE - or die "Error creating DB: $!"; -while (my ($k, $v) = each(%packages_contents)) { - $packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k}) - . $v; -} -untie %packages_contents_db; + my $go = 0; + my $suite_mtime = (stat "$DBDIR/reverse_$suite.db")[9]; + for my $file (glob "$DBDIR/reverse_${suite}_*.txt") { + $go = 1 if not defined $suite_mtime + or $suite_mtime < (stat $file)[9]; + } + next unless $go; -my %file_reverse_db; -tie %file_reverse_db, "DB_File", "$DBDIR/file_reverse.db.new", + print "Merging reverse path lists for ${suite}...\n"; + + open MERGED, "-|", "sort -m $DBDIR/reverse_${suite}_*.txt" + or die "Failed to open merged list"; + open FILENAMES, ">", "$DBDIR/filenames_$suite.txt.new" + or die "Failed to open filenames list"; + tie my %reverse_path_db, "DB_File", "$DBDIR/reverse_${suite}.db.new", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Error creating DB: $!"; -while (my ($x, $y) = each(%file_reverse)) { -# $v =~ s/.$//s; -# my $nr = $v; -# $nr =~ s/[^\000]//g; -# $nr = length($nr) + 1; # < number of hits -# if ($nr > $MAX_file_reverse) { -# $v = "\001" . $nr; -# } - $file_reverse_db{$x} = $y; + + my $lastpath = my $lastcasepath = my $lastfile = ""; + my %matches = (); + while () { + print "Doing line ".($NR/1000000)."M (out of approx. 20M)\n" + if $NR % 1000000 == 0; + chomp; + my @line = split m/\0/o, $_; + my $revpath = shift @line; + my $casepath = shift @line; + if ($revpath ne $lastpath) { + # Wrap: Do useful stuff with this ($lastpath, @matches) + if ($lastpath ne "") { + my @matches; + while (my ($k, $v) = each %matches) { + push @matches, join("\0", $k, @$v); + } + $reverse_path_db{$lastpath} = join "\1", @matches; + %matches = (); + } + $lastpath =~ s,/.*,,o; + if ($lastfile ne $lastpath) { + $lastfile = $lastpath; + print FILENAMES (reverse $lastfile)."\n"; + } + # + $lastpath = $revpath; + $lastcasepath = $casepath; + $matches{$casepath} = \@line; + next; +# } elsif ($lastcasepath ne "" and $casepath ne $lastcasepath) { +# warn reverse($revpath)." has more than one casepath: $casepath $lastcasepath\n"; + } + push @{$matches{$casepath}}, @line; + } + # Note: do useful stuff here too, for out last entry. Maybe prevent this by + # adding a fake ultimate entry? + { + my @matches; + while (my ($k, $v) = each %matches) { + push @matches, join("\0", $k, @$v); + } + $reverse_path_db{$lastpath} = join "\1", @matches; + } + + untie %reverse_path_db; + close FILENAMES; + close MERGED; + + activate("$DBDIR/filenames_$suite.txt"); + activate("$DBDIR/reverse_$suite.db"); } -untie %file_reverse_db; -rename("$DBDIR/packages_contents.db.new", "$DBDIR/packages_contents.db"); -rename("$DBDIR/file_reverse.db.new", "$DBDIR/file_reverse.db"); +# vim: set ts=4