#!/usr/bin/perl -w # Convert Contents.gz files into Sleepycat db files for efficient usage of # data # # $Id$ # # Copyright (C) 2006 Jeroen van Wolffelaar # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA use strict; use lib './lib'; $| = 1; # Important, we want sorting and such to happen like in the C locale: binary, # without any fancy collation. FIXME: is this actually adequate? $ENV{"LC_ALL"} = 'C'; my $what = $ARGV[0] ? "head -10000|" : ""; # More RAM vs more disk I/O tradeoff parameters, does not change # functionality. True will always use more RAM at the benefit of less # temporary files, and is adviced when possible my $SORT_REVERSE_CONCURRENTLY = 1; use DB_File; use Storable; use File::Path; use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES @ARCHITECTURES ); &Packages::Config::init( './' ); my @archives = @ARCHIVES; my @suites = @SUITES; my @archs = @ARCHITECTURES; $DBDIR .= "/contents"; -d $DBDIR || mkpath( $DBDIR ); for my $suite (@suites) { for my $arch (@archs) { my $filelist_db = "$DBDIR/filelists_${suite}_${arch}.db"; my $dbtime = (stat $filelist_db)[9]; my %packages_contents = (); my %packages_contents_nr = (); my %packages_contents_lastword = (); my $extra = ""; $extra = "|sort" if $SORT_REVERSE_CONCURRENTLY; open REVERSED, "$extra>$DBDIR/reverse.tmp" or die "Failed to open output reverse file: $!"; my $changed = 0; for my $archive (@archives) { my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz"; next unless -f $filename; # Note: ctime, because mtime is set back via rsync my $ftime = (stat $filename)[10]; next if defined $dbtime and $dbtime > $ftime; print "$archive/$suite/$arch needs update\n"; $changed++; } if ($changed) { for my $archive (@archives) { my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz"; next unless -f $filename; print "Reading $archive/$suite/$arch...\n"; open CONT, "zcat $filename|$what" or die $!; while () {last if /^FILE/mo;} open CONT, "zcat $filename|$what" if eof(CONT); while () { my $data = ""; my %data = (); chomp; print "Doing line ".($./1000)."k (out of approx 1.5M)\n" if $. % 250000 == 0; /^(.+?)\s+(\S+)$/o; my ($file, $value) = ($1, $2); $value =~ s#[^,/]+/##og; my @packages = split /,/, $value; for (@packages) { $packages_contents_nr{$_}++; my $lw = $packages_contents_lastword{$_} || "\0"; my $i=0; while (substr($file,$i,1) eq substr($lw,$i++,1)) {} $i--; $i = 255 if $i > 255; $packages_contents{$_} .= pack "CC/a*", ($i, substr($file, $i)); $packages_contents_lastword{$_} = "$file\0"; } # Searches are case-insensitive $file =~ tr [A-Z] [a-z]; print REVERSED (reverse $file)."\0".(join ":$arch\0", @packages).":$arch\n"; } close CONT; } close REVERSED; print "Sorting reverse list if needed\n"; system("cd $DBDIR && sort reverse.tmp > reverse.sorted && mv reverse.{sorted,tmp}") == 0 or die "Failed to sort reverse" unless $SORT_REVERSE_CONCURRENTLY; print "Writing filelist db\n"; tie my %packages_contents_db, "DB_File", "$filelist_db.new", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Error creating DB: $!"; while (my ($k, $v) = each(%packages_contents)) { $packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k}) . $v; } untie %packages_contents_db; rename("$DBDIR/reverse.tmp", "$DBDIR/reverse_${suite}_${arch}.txt"); rename("$filelist_db.new", $filelist_db); system("ln -sf $filelist_db $DBDIR/filelists_${suite}_all.db") == 0 or die "Oops"; } } my $go = 0; my $suite_mtime = (stat "$DBDIR/reverse_$suite.db")[9]; for my $file (glob "$DBDIR/reverse_${suite}_*.txt") { $go = 1 if not defined $suite_mtime or $suite_mtime < (stat $file)[9]; } next unless $go; print "Merging reverse path lists for ${suite}...\n"; open MERGED, "sort -m $DBDIR/reverse_${suite}_*.txt |" or die "Failed to open merged list"; open FILENAMES, "> $DBDIR/filenames_$suite.txt.new" or die "Failed to open filenames list"; tie my %reverse_path_db, "DB_File", "$DBDIR/reverse_${suite}.db.new", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Error creating DB: $!"; my $lastpath = ""; my $lastfile = ""; my @matches = (); while () { print "Doing line ".($./1000000)."M (out of approx. 16M)\n" if $. % 1000000 == 0; chomp; my @line = split /\0/o, $_; my $revpath = shift @line; if ($revpath ne $lastpath) { # Wrap: Do useful stuff with this ($lastpath, @matches) $reverse_path_db{$lastpath} = join "\0", @matches if $lastpath ne ""; $lastpath =~ s,/.*,,o; if ($lastfile ne $lastpath) { $lastfile = $lastpath; print FILENAMES (reverse $lastfile)."\n"; } # $lastpath = $revpath; @matches = @line; next; } push @matches, @line } # Note: do useful stuff here too, for out last entry. Maybe prevent this by # adding a fake ultimate entry? $reverse_path_db{$lastpath} = join "\0", @matches; untie %reverse_path_db; close FILENAMES; close MERGED; rename "$DBDIR/filenames_$suite.txt.new", "$DBDIR/filenames_$suite.txt"; rename "$DBDIR/reverse_$suite.db.new", "$DBDIR/reverse_$suite.db"; } # vim: set ts=4