X-Git-Url: https://git.deb.at/?a=blobdiff_plain;f=bin%2Fparse-contents;h=767aad433ff57b01e5bf0c41e5b86994f4aea252;hb=4cb36e543bb889e040b1c95b6d697582e28eeec2;hp=ceb0cb0e5bce0eea3b958b93d6e5dfc4bc02a2d4;hpb=eca3f1d1d3ddba9041b4cae9cc7c54bcb95cc3ef;p=deb%2Fpackages.git

diff --git a/bin/parse-contents b/bin/parse-contents
index ceb0cb0..767aad4 100755
--- a/bin/parse-contents
+++ b/bin/parse-contents
@@ -22,35 +22,79 @@
 use strict;
 use lib './lib';
 
-my $what = $ARGV[0] ? "head -100000|" : "";
+$| = 1;
+
+# Important, we want sorting and such to happen like in the C locale: binary,
+# without any fancy collation. FIXME: is this actually adequate?
+$ENV{"LC_ALL"} = 'C';
+
+my $what = $ARGV[0] ? "head -10000|" : "";
+
+# More RAM vs more disk I/O tradeoff parameters, does not change
+# functionality. True will always use more RAM at the benefit of less
+# temporary files, and is adviced when possible
+my $SORT_REVERSE_CONCURRENTLY = 1;
 
 use DB_File;
 use Storable;
-use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES );
+use File::Path;
+use Packages::Config qw( $TOPDIR $DBDIR @ARCHIVES @SUITES @ARCHITECTURES );
 &Packages::Config::init( './' );
-my %packages_contents = ();
-my %packages_contents_nr = ();
-my %packages_contents_lastword = ();
-my %file_reverse = ();
-
-my @archives =( 'us'); #@ARCHIVES
-my @suites = ('stable');#@SUITES
-
-for my $archive (@archives) { for my $suite (@suites) {
-
-	print "Reading $archive/$suite/i386...\n";
-	open CONT, "zcat /org/ftp.debian.org/ftp/dists/stable/Contents-i386.gz|$what";
-	while (1) {$_ = <CONT>;last if /^FILE/mo;}
-	while (<CONT>) {
-		my $data = "";
-		my %data = ();
-		chomp;
-		print "Doing line $.\n" if $. % 10000 == 0;
-		/^(\S+)\s+(\S+)/;
-		my ($file, $value) = ($1, $2);
-		$value =~ s#[^,/]+/##g;
-		my @packages = split /,/, $value;
-		for (@packages) {
+
+my @archives = @ARCHIVES;
+my @suites = @SUITES;
+my @archs = @ARCHITECTURES;
+
+$DBDIR .= "/contents";
+-d $DBDIR || mkpath( $DBDIR );
+
+for my $suite (@suites) {
+    for my $arch (@archs) {
+
+	my $filelist_db = "$DBDIR/filelists_${suite}_${arch}.db";
+	my $dbtime = (stat $filelist_db)[9];
+	my %packages_contents = ();
+	my %packages_contents_nr = ();
+	my %packages_contents_lastword = ();
+	
+	my $extra = "";
+	$extra = "|sort" if $SORT_REVERSE_CONCURRENTLY;
+	
+	open REVERSED, "$extra>$DBDIR/reverse.tmp"
+	    or die "Failed to open output reverse file: $!";
+
+	my $changed = 0;
+	for my $archive (@archives) { 
+
+	    my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz";
+	    next unless -f $filename;
+	    # Note: ctime, because mtime is set back via rsync
+	    my $ftime = (stat $filename)[10];
+	    next if defined $dbtime and $dbtime > $ftime;
+	    print "$archive/$suite/$arch needs update\n";
+	    $changed++;
+	}
+	if ($changed) {
+	    for my $archive (@archives) { 
+
+		my $filename = "$TOPDIR/archive/$archive/$suite/Contents-$arch.gz";
+		next unless -f $filename;
+		print "Reading $archive/$suite/$arch...\n";
+		
+		open CONT, "zcat $filename|$what"
+		    or die $!;
+		while (<CONT>) {last if /^FILE/mo;}
+		open CONT, "zcat $filename|$what" if eof(CONT);
+		while (<CONT>) {
+		    my $data = "";
+		    my %data = ();
+		    chomp;
+		    print "Doing line ".($./1000)."k (out of approx 1.5M)\n" if $. % 250000 == 0;
+		    /^(.+?)\s+(\S+)$/o;
+		    my ($file, $value) = ($1, $2);
+		    $value =~ s#[^,/]+/##og;
+		    my @packages = split /,/, $value;
+		    for (@packages) {
 			$packages_contents_nr{$_}++;
 			my $lw = $packages_contents_lastword{$_} || "\0";
 			my $i=0;
@@ -59,40 +103,92 @@ for my $archive (@archives) { for my $suite (@suites) {
 			$i = 255 if $i > 255;
 			$packages_contents{$_} .= pack "CC/a*", ($i, substr($file, $i));
 			$packages_contents_lastword{$_} = "$file\0";
+		    }
+		    # Searches are case-insensitive
+		    $file =~ tr [A-Z] [a-z];
+		    
+		    print REVERSED (reverse $file)."\0".(join ":$arch\0", @packages).":$arch\n";
 		}
-		# Searches are case-insensitive
-		$file =~ tr [A-Z] [a-z];
-
-		$file_reverse{reverse $file} = join "\0", @packages;
+		close CONT;
+		
+	    }
+	    close REVERSED;
+	    
+	    print "Sorting reverse list if needed\n";
+	    system("cd $DBDIR && sort reverse.tmp > reverse.sorted && mv reverse.{sorted,tmp}") == 0
+		or die "Failed to sort reverse"
+		unless $SORT_REVERSE_CONCURRENTLY;
+	    
+	    print "Writing filelist db\n";
+	    tie my %packages_contents_db, "DB_File", "$filelist_db.new",
+	    O_RDWR|O_CREAT, 0666, $DB_BTREE
+		or die "Error creating DB: $!";
+	    while (my ($k, $v) = each(%packages_contents)) {
+		$packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k})
+		    . $v;
+	    }
+	    untie %packages_contents_db;
+	
+	    rename("$DBDIR/reverse.tmp", "$DBDIR/reverse_${suite}_${arch}.txt");
+	
+	    rename("$filelist_db.new", $filelist_db);
+	    system("ln -sf $filelist_db $DBDIR/filelists_${suite}_all.db") == 0
+		or die "Oops";
 	}
-}}
+    }
+			  
+    my $go = 0;
+    my $suite_mtime = (stat "$DBDIR/reverse_$suite.db")[9];
+    for my $file (glob "$DBDIR/reverse_${suite}_*.txt") {
+	$go = 1 if not defined $suite_mtime
+	    or $suite_mtime < (stat $file)[9];
+    }
+    next unless $go;
 
-print "Writing databases...\n";
-my %packages_contents_db;
-tie %packages_contents_db, "DB_File", "$DBDIR/packages_contents.db.new",
-	O_RDWR|O_CREAT, 0666, $DB_BTREE
-	or die "Error creating DB: $!";
-while (my ($k, $v) = each(%packages_contents)) {
-	$packages_contents_db{$k} = (pack "L", $packages_contents_nr{$k})
-		. $v;
-}
-untie %packages_contents_db;
+    print "Merging reverse path lists for ${suite}...\n";
 
-my %file_reverse_db;
-tie %file_reverse_db, "DB_File", "$DBDIR/file_reverse.db.new",
-	O_RDWR|O_CREAT, 0666, $DB_BTREE
+    open MERGED, "sort -m $DBDIR/reverse_${suite}_*.txt |"
+	or die "Failed to open merged list";
+    open FILENAMES, "> $DBDIR/filenames_$suite.txt.new"
+	or die "Failed to open filenames list";
+    tie my %reverse_path_db, "DB_File", "$DBDIR/reverse_${suite}.db.new",
+    O_RDWR|O_CREAT, 0666, $DB_BTREE
 	or die "Error creating DB: $!";
-while (my ($x, $y) = each(%file_reverse)) {
-#	$v =~ s/.$//s;
-#	my $nr = $v;
-#	$nr =~ s/[^\000]//g;
-#	$nr = length($nr) + 1; # < number of hits
-#	if ($nr > $MAX_file_reverse) {
-#		$v = "\001" . $nr;
-#	}
-	$file_reverse_db{$x} = $y;
+
+    my $lastpath = "";
+    my $lastfile = "";
+    my @matches = ();
+    while (<MERGED>) {
+	print "Doing line ".($./1000000)."M (out of approx. 16M)\n"
+	    if $. % 1000000 == 0;
+	chomp;
+	my @line = split /\0/o, $_;
+	my $revpath = shift @line;
+	if ($revpath ne $lastpath) {
+	    # Wrap: Do useful stuff with this ($lastpath, @matches)
+	    $reverse_path_db{$lastpath} = join "\0", @matches if $lastpath ne "";
+	    $lastpath =~ s,/.*,,o;
+	    if ($lastfile ne $lastpath) {
+		$lastfile = $lastpath;
+		print FILENAMES (reverse $lastfile)."\n";
+	    }
+	    #
+	    $lastpath = $revpath;
+	    @matches = @line;
+	    next;
+	}
+	push @matches, @line
+	}
+    # Note: do useful stuff here too, for out last entry. Maybe prevent this by
+    # adding a fake ultimate entry?
+    $reverse_path_db{$lastpath} = join "\0", @matches;
+    
+    untie %reverse_path_db;
+    close FILENAMES;
+    close MERGED;
+    
+    rename "$DBDIR/filenames_$suite.txt.new", "$DBDIR/filenames_$suite.txt";
+    rename "$DBDIR/reverse_$suite.db.new", "$DBDIR/reverse_$suite.db";
 }
-untie %file_reverse_db;
 
-rename("$DBDIR/packages_contents.db.new", "$DBDIR/packages_contents.db");
-rename("$DBDIR/file_reverse.db.new", "$DBDIR/file_reverse.db");
+# vim: set ts=4