#!/usr/bin/perl # # Script to extract files from Debian packages # Copyright 2004-2007 Frank Lichtenheld # # based on a shell script which was # Copyright 2003 Noel Köthe # Copyright 2004 Martin Schulze # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. use strict; use warnings; use FindBin; use lib "$FindBin::Bin/../lib"; use lib "$FindBin::Bin"; use Getopt::Long; use File::Temp qw( tempdir ); use File::Path; use File::Copy; use File::Basename; #use Digest::SHA1; use Deb::Versions; use Parse::DebControl; use Parse::DebianChangelog; use Cwd; use Fcntl qw(:DEFAULT :flock); use Data::Dumper; use DB_File; use constant PKGPOOL => 1; use constant DIGESTPOOL => 2; my $PROGNAME = 'extract_files'; my $MAXWARN = 100; my $TEMPDIR; my $directory = cwd()."/pool"; my $dumpfile = ''; my $cachefile = ''; my $target = cwd()."/extracted_files"; my $workdir = ''; my $configdir = cwd()."/etc"; my ( $verbose, $version, $help, $debug, $force, $use_dump ); my %opthash = ( 'verbose|v' => \$verbose, 'force|f' => \$force, 'directory|d=s' => \$directory, 'config|c=s' => \$configdir, 'target|t=s' => \$target, 'workdir|w=s' => \$workdir, 'cachefile=s' => \$cachefile, 'dumpfile=s' => \$dumpfile, 'use_dump' => \$use_dump, 'version' => \$version, 'debug' => \$debug, 'help' => \$help, ); my (%src_packages, %bin_packages, %cache); my %stats = ( src_pkgs => 0, src_cache => 0, already_extracted => 0, bin_pkgs => 0, bin_cache => 0, ); Getopt::Long::config('no_getopt_compat', 'no_auto_abbrev'); GetOptions(%opthash) or do_error( "couldn't parse commandline parameters" ); $verbose ||= $debug; $directory =~ s,/+$,,o; if ($workdir) { $TEMPDIR = tempdir( 'pdo_extract_file.XXXXXX', DIR => $workdir, CLEANUP => 1 ); } else { $TEMPDIR = tempdir( 'pdo_extract_file.XXXXXX', CLEANUP => 1 ); } ################################################## # OUTPUT/LOGGING sub do_error { die "$PROGNAME: FATAL: @_\n"; } my $no_warnings = 0; sub do_warning { warn "$PROGNAME: WARNING: @_\n"; if (++$no_warnings > $MAXWARN) { do_error( "too many warnings ($MAXWARN)" ); } } sub do_info { if ($verbose) { print "$PROGNAME: INFO: @_\n"; } } sub do_debug { if ($debug) { print "$PROGNAME: DEBUG: @_\n"; } } sub add_log { my $log = shift; do_debug( @_ ); $$log .= localtime().": @_\n"; } ################################################## # PACKAGE HANDLING (UNPACKING/CLEANUP) sub unpack_srcpkg { my ( $pkgname, $dscname, $log ) = @_; chdir( $TEMPDIR ) or do_error( "couldn't change working directory to $TEMPDIR" ); add_log( $log, "dpkg-source -sn -x $dscname $pkgname+source" ); system("dpkg-source", "-sn", "-x", $dscname, "$pkgname+source" ) == 0 or do { do_warning( "couldn't unpack $dscname: $!" ); add_log( $log, "couldn't unpack $dscname: $!" ); return; }; return "$pkgname+source"; } sub unpack_binpkg { my ( $pkgname, $debname, $log ) = @_; add_log( $log, "unpacking binary package $pkgname" ); mkdir( "$TEMPDIR/$pkgname" ) or do { do_warning( "couldn't create directory $TEMPDIR/$pkgname" ); add_log( $log, "couldn't create directory $TEMPDIR/$pkgname" ); return; }; add_log( $log, "dpkg-deb --extract $debname $TEMPDIR/$pkgname" ); system("dpkg-deb", "--extract", $debname, "$TEMPDIR/$pkgname" ) == 0 or do { do_warning( "couldn't unpack $debname" ); add_log( $log, "couldn't unpack $debname" ); return; }; return 1; } sub unpack_allbinpkg { my ($pkg_data, $log) = @_; my %already_seen; foreach my $pkg (@{$pkg_data->{bins}}) { next if $already_seen{$pkg->{bin_name}}; # some assumptions about sane version numbers included unpack_binpkg($pkg->{bin_name}, $pkg->{deb}, $log ); $already_seen{$pkg->{bin_name}}++; } } sub cleanup_binpkg { my ($pkg_data) = @_; foreach my $pkg (keys %{$pkg_data->{bin_list}}) { # rmtree should do that itself, but there seems to be a bug somewhere system( "chmod", "-R", "u+rwx", "$TEMPDIR/$pkg" ); rmtree( "$TEMPDIR/$pkg" ); } } ################################################## # POOL HANDLING sub pkg_pool_directory { my ($pkg_data) = @_; my $name = $pkg_data->{src_name}; my $version = $pkg_data->{src_version}; my $dscname = $pkg_data->{dsc}; my $dir = ""; # I would prefer $name_$version but lets be backward compatible # in case someone depends on the naming if ($name =~ /^(lib.)/o) { # $dir .= "$1/$name/${name}_$version"; $dir .= "$1/$name/".basename($dscname, '.dsc'); } else { # $dir .= substr($name,0,1)."/$name/${name}_$version"; $dir .= substr($name,0,1)."/$name/".basename($dscname, '.dsc'); } return $dir; } sub to_update { my ($pkg_data, $config_data, $log) = @_; if ($config_data->{structure} == PKGPOOL) { my $dir = "$target/".pkg_pool_directory( $pkg_data ); if (!$force && -d $dir && -f "$dir/log") { (system( "touch", "$dir/log" ) == 0) or do_warning( "touch of $dir/log failed" ); return 0; } else { rmtree( $dir ); return 1; } } elsif ($config_data->{structure} == DIGESTPOOL) { die "UNIMPLEMENTED!"; } else { do_error( "unknown pool structure $config_data->{structure}" ); } } sub write_log ($$) { my ($dir, $log) = @_; open my $logfh, ">$dir/log" or do_error( "couldn't open log file $dir/log.\n$log" ); flock $logfh, LOCK_EX or do_error( "couldn't lock log file $dir/log" );; print $logfh $log; close $logfh or do_warning( "couldn't close log file $dir/log" ); } ################################################## # EXTRACTION sub extract_copyright_to_pkgpool { my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_; add_log( $log, "copy copyright file from source package" ); my $src_tgt = "$target_dir/copyright"; copy( "$source_dir/debian/copyright", $src_tgt ) or add_log( $log, "seems to have failed: $!" ); foreach my $bin_pkg (keys %{$pkg_data->{bin_list}}) { my $usd = "$TEMPDIR/$bin_pkg/usr/share/doc/$bin_pkg"; my $cpy = "$usd/copyright"; my $tgt = "$target_dir/$bin_pkg.copyright"; if (-f $cpy) { add_log( $log, "copy copyright file from binary package $bin_pkg" ); copy( $cpy, $tgt ) or add_log( $log, "seems to have failed: $!" ); } elsif (-l $cpy ) { add_log( $log, "copyright file $cpy is symlink, I can't handle that" ); } elsif (-l $usd) { add_log( $log, "doc directory $usd is symlink" ); my $link = readlink($usd) or add_log( $log, "readlink $usd failed" ); if ($link && $link =~ m,^(?:\./)?(\S+)/?$,o) { # do a little sanity check my $pkg2 = $1; if ($pkg_data->{bin_list}{$pkg2}) { add_log( $log, "symlink points to $pkg2, make symlink to copyright file" ); (system("ln", "-s", "$pkg2.copyright", $tgt ) == 0) or add_log( $log, "symlink creation failed" ); } else { add_log( $log, "symlink points to $pkg2, don't know what to do with that" ); } } else { add_log( $log, "link seems fishy, not using" ); } } unless (-e $tgt || -l $tgt) { # if it is a link, we can't be sure that the target package was already processed add_log( $log, "copyright file $tgt still doesn't exist" ); if (-e $src_tgt) { add_log( $log, "copyright file of the source package exists, make symlink" ); (system("ln", "-s", "copyright", $tgt ) == 0) or add_log( $log, "symlink generation failed" ); } else { add_log( $log, "give up on $bin_pkg" ); (system( "touch", "$tgt.ERROR" ) == 0) or add_log( $log, "even the touch of $tgt.ERROR failed :(" ); } } } #foreach $bin_pkg unless (-e $src_tgt) { add_log( $log, "copyright file $src_tgt still doesn't exist" ); # take one of the binary packages, prefering one that has # the same name as the source package foreach my $bin_pkg (($pkg_data->{src_name}, keys %{$pkg_data->{bin_list}})) { if (-e "$target_dir/$bin_pkg.copyright") { add_log( $log, "copyright file $target_dir/$bin_pkg.copyright seems like a good guess to me, make a symlink" ); (system("ln", "-s", "$bin_pkg.copyright", $src_tgt ) == 0) or do { add_log( $log, "symlink generation failed" ); next; }; last; } } unless (-e $src_tgt) { add_log( $log, "give up" ); (system( "touch", "$src_tgt.ERROR" ) == 0) or add_log( $log, "even the touch of $src_tgt.ERROR failed :(" ); } } } sub extract_changelog_to_pkgpool { my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_; add_log( $log, "copy changelog file from source package" ); my $src_changelog = copy( "$source_dir/debian/changelog", "$target_dir/changelog.txt" ); if ($src_changelog) { add_log( $log, "changelog file sucessfully copied" ); } else { add_log( $log, "seems to have failed: $!" ); } add_log( $log, "create enhanced HTML version" ); my $chg = Parse::DebianChangelog->init; my $parsed = $chg->parse( { infile => "$source_dir/debian/changelog" } ); if ($parsed) { $chg->html( { outfile => "$target_dir/changelog.html", template => "$configdir/tmpl/default.tmpl" } ); add_log( $log, scalar $chg->get_parse_errors ) if $chg->get_parse_errors; } else { do_warning( $chg->get_error ); add_log( $log, $chg->get_error ); } } sub manage_current_link { my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_; my $parent_dir = dirname($target_dir); my $dirname = basename($target_dir); my $current_link = "$parent_dir/current"; add_log( $log, "parent_dir=$parent_dir; dirname=$dirname" ); unless (-l $current_link) { add_log( $log, "create new current link" ); (chdir( $parent_dir ) and not system( 'ln', '-s', $dirname, 'current' )) or add_log( $log, "creating new current link failed: $!" ); } else { my $old_target = readlink( $current_link ); (my $old_version = $old_target) =~ s/^[^_]*_//o; if (version_cmp( $pkg_data->{src_version}, $old_version) > 0) { add_log( $log, "old_version=$old_version; overwriting current link" ); (chdir( $parent_dir ) and unlink( 'current' ) and not system( 'ln', '-s', $dirname, 'current' )) or add_log( $log, "overwriting current link failed: $!" ); } else { add_log( $log, "old_version=$old_version; not touching current link" ); } } } sub extract_files { my ($pkg_data, $config_data) = @_; my $log = ""; add_log( \$log, "process source package $pkg_data->{src_name} ($pkg_data->{src_version})" ); unless (to_update( $pkg_data, $config_data, \$log )) { $stats{already_extracted}++; do_debug( "source package $pkg_data->{src_name} ($pkg_data->{src_version}) doesn't need extracting" ); return; } if (my $source_dir = unpack_srcpkg( $pkg_data->{src_name}, $pkg_data->{dsc}, \$log )) { $source_dir = "$TEMPDIR/$source_dir"; unpack_allbinpkg($pkg_data, \$log); my $target_dir = "$target/".pkg_pool_directory($pkg_data); add_log( \$log, "source_dir=$source_dir; target_dir=$target_dir" ); mkpath( $target_dir ); if ($config_data->{structure} == PKGPOOL) { extract_copyright_to_pkgpool( $pkg_data, $config_data, \$log, $source_dir, $target_dir ); extract_changelog_to_pkgpool( $pkg_data, $config_data, \$log, $source_dir, $target_dir ); manage_current_link( $pkg_data, $config_data, \$log, $source_dir, $target_dir ); } elsif ($config_data->{structure} == DIGESTPOOL) { die "UNIMPLEMENTED!"; } else { do_error( "unknown pool structure $config_data->{structure}" ); } # rmtree should do that itself, but there seems to be a bug somewhere system( "chmod", "-R", "u+rwx", "$source_dir" ); rmtree( $source_dir ); cleanup_binpkg($pkg_data); write_log( $target_dir, $log ); } } sub extract_from_all { my ( $src_packages ) = @_; unless (-d $target) { mkpath( $target ) or do_error( "couldn't create target directory" ); } # TODO: make configurable my %config = ( structure => PKGPOOL, ); do_info( scalar(keys(%$src_packages))." source packages to process" ); foreach my $p (keys %$src_packages) { foreach my $v (keys %{$src_packages->{$p}}) { extract_files( $src_packages->{$p}{$v}, \%config ); } } } ################################################## # COLLECTING INFORMATION sub merge_src_bin_packages { my ( $src_packages, $bin_packages ) = @_; foreach my $p (keys %$bin_packages) { # packages foreach my $v (keys %{$bin_packages->{$p}}) { # versions foreach my $a (keys %{$bin_packages->{$p}{$v}}) { # architectures my %bin_data = %{$bin_packages->{$p}{$v}{$a}}; if (exists $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}) { $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bins} ||= []; push @{$src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bins}}, \%bin_data; $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bin_list}{$p}++; } } } } return $src_packages; } sub read_dsc { my ( $dscname ) = @_; my $parser = Parse::DebControl->new(); my ( $raw_data, $pkg_data ); my $dsccontent = $cache{$dscname}; unless ($dsccontent) { open my $dscfh, "<", $dscname or do { do_warning( "reading file $dscname failed" ); return; }; $dsccontent = ""; while (<$dscfh>) { next if /^\#/o; if (/^-----BEGIN PGP SIGNED MESSAGE/o) { while (<$dscfh> =~ /\S/) {}; # skip Hash: line and similar next; } if (/^-----BEGIN PGP SIGNATURE/o) { last; # stop reading } $dsccontent .= $_; } $cache{$dscname} = $dsccontent; } else { $stats{src_cache}++; if ($debug) { (my $begin = substr($dsccontent,0,20)) =~ s/\n/\\n/go; do_debug( "CACHE HIT: $dscname ($begin)" ); } } unless ( $raw_data = $parser->parse_mem( $dsccontent, { discardCase => 1 } ) ) { do_warning( "parsing file $dscname failed.\n$dsccontent" ); return; } my $no_chunks = @$raw_data; if ($no_chunks != 1) { do_warning( "expected exactly one chunk in .dsc file, got $no_chunks" ); return; } $pkg_data = { src_name => $raw_data->[0]{source}, src_version => $raw_data->[0]{version}, dsc => $dscname, }; unless( $pkg_data->{src_name} && defined($pkg_data->{src_version}) && $pkg_data->{dsc} ) { use Data::Dumper; do_error( "something fishy happened.\n", Dumper( $pkg_data ) ); } do_debug( "found source package $pkg_data->{src_name}, version $pkg_data->{src_version}" ); $stats{src_pkgs}++; return $pkg_data; } sub read_deb { my ( $debname ) = @_; my $parser = Parse::DebControl->new(); my ( $raw_data, $pkg_data ); if ($cache{$debname}) { $stats{bin_cache}++; if ($debug) { (my $begin = substr($cache{$debname},0,20)) =~ s/\n/\\n/go; do_debug( "CACHE HIT: $debname ($begin)" ); } } $cache{$debname} ||= qx/dpkg-deb --info "$debname" control/; my $control = $cache{$debname}; unless ( $raw_data = $parser->parse_mem( $control, { discardCase => 1 } ) ) { do_warning( "parsing control information <<$control>> of file $debname failed" ); return; } my $no_chunks = @$raw_data; if ($no_chunks != 1) { do_warning( "expected exactly one chunk in .deb control information, got $no_chunks" ); return; } $pkg_data = { bin_name => $raw_data->[0]{package}, bin_version => $raw_data->[0]{version}, bin_arch => $raw_data->[0]{architecture}, bin_src => $raw_data->[0]{source} || $raw_data->[0]{package},, bin_src_version => $raw_data->[0]{version}, deb => $debname, }; if ($pkg_data->{bin_src} =~ /^([\w.+-]+)\s*\(\s*=\s*([^\s\)])\s*\)\s*$/) { $pkg_data->{bin_src} = $1; $pkg_data->{bin_src_version} = $2; } do_debug( "found binary package $pkg_data->{bin_name}, version $pkg_data->{bin_version}, architecture $pkg_data->{bin_arch}" ); $stats{bin_pkgs}++; return $pkg_data; } sub collect_deb { my ( $debname ) = @_; do_debug( "processing deb file $debname" ); my $pkg_data = read_deb( $debname ); return unless $pkg_data; if (exists $bin_packages{$pkg_data->{bin_name}}{$pkg_data->{bin_version}}{$pkg_data->{bin_arch}}) { do_warning( "duplicated package $pkg_data->{bin_name}, version {$pkg_data->{bin_version}{$pkg_data->{bin_arch}}" ); return; } else { $bin_packages{$pkg_data->{bin_name}}{$pkg_data->{bin_version}}{$pkg_data->{bin_arch}} = $pkg_data; } } sub collect_dsc { my ( $dscname ) = @_; do_debug( "processing dsc file $dscname" ); my $pkg_data = read_dsc( $dscname ); return unless $pkg_data; if (exists $src_packages{$pkg_data->{src_name}}{$pkg_data->{src_version}}) { do_warning( "duplicated package $pkg_data->{src_name}, version {$pkg_data->{src_version}" ); return; } else { $src_packages{$pkg_data->{src_name}}{$pkg_data->{src_version}} = $pkg_data; } } sub read_sub { my ( $dir ) = @_; do_debug( "processing directory $dir" ); opendir my $dh, $dir or do_error( "couldn't open directory $dir" ); while( my $entry = readdir $dh ) { chomp $entry; next if $entry =~ /^\.\.?$/o; my $fullname = "$dir/$entry"; read_sub( $fullname ) if -d $fullname; collect_dsc( $fullname ) if -f _ && ( $fullname =~ /\.dsc$/o ); collect_deb( $fullname ) if -f _ && ( $fullname =~ /\..?deb$/o ); } closedir $dh or do_warning( "couldn't close directory $dir" ); } ################################################## # MAIN PROGRAM do_info( "Using working directory $TEMPDIR" ); if ($use_dump) { do_info( "load information from dump file" ); open DUMP, '<', $dumpfile or do_error( "couldn't open dump file $dumpfile: $!" ); my $info = join "", ; eval $info; close DUMP or do_warning( "couldn't close dump file: $!" ); } else { do_info( "collect information (in $directory)" ); if ($cachefile) { tie %cache, 'DB_File', $cachefile, O_CREAT|O_RDWR, 0640 or die "E: tie with file $cachefile failed: $!"; } read_sub( $directory ); #FIXME: "untie attempted while 1 inner references still exist" # untie %cache if tied %cache; do_info( "postprocess collected information" ); merge_src_bin_packages( \%src_packages, \%bin_packages ); if ($dumpfile) { do_info( "dump backup of collected information" ); open DUMP, '>', $dumpfile or do_error( "couldn't open dump file $dumpfile: $!" ); print DUMP Data::Dumper->Dump( [ \%src_packages ], [ '*src_packages' ] ); close DUMP or do_warning( "couldn't close dump file: $!" ); } } do_info( "begin extracting files" ); extract_from_all( \%src_packages ); do_info( <