From: Frank Lichtenheld
Date: Sun, 17 Jun 2007 15:13:03 +0000 (+0200)
Subject: parse-translations: new script to parse the Translation files
X-Git-Url: https://git.deb.at/w?a=commitdiff_plain;h=36cd772d5715368c75f6aa7bef3dc526aa876a76;p=deb%2Fpackages.git

parse-translations: new script to parse the Translation files
---

diff --git a/bin/parse-translations b/bin/parse-translations
new file mode 100755
index 0000000..436e435
--- /dev/null
+++ b/bin/parse-translations
@@ -0,0 +1,93 @@
+#!/usr/bin/perl -w
+# Convert Translation.gz files into Sleepycat db files for efficient usage of
+# data
+#
+# $Id$
+#
+# Copyright (C) 2006 Jeroen van Wolffelaar
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+use warnings;
+use lib './lib';
+
+$| = 1;
+
+# max. distinct results for a given package postfix
+my $MAX_PACKAGE_POSTFIXES = 100;
+
+use DB_File;
+use Storable;
+use File::Path;
+use Digest::MD5;
+use Deb::Versions;
+use Lingua::Stem v0.82;
+use Search::Xapian;
+use Packages::Config qw( $TOPDIR $DBDIR @DDTP_LANGUAGES );
+&Packages::Config::init( './' );
+my %descriptions = ();
+
+$/ = "";
+
+-d $DBDIR || mkpath( $DBDIR );
+
+foreach my $lang (@DDTP_LANGUAGES) {
+    print "Reading Translations for $lang...";
+    open PKG, "zcat $TOPDIR/archive/*/*/*/i18n/Translation-$lang.gz|";
+    my $count = 0;
+    while (<PKG>) {
+        next if /^\s*$/;
+        my $data = "";
+        my %data = ();
+        chomp;
+        s/\n /\377/g;
+        while (/^(\S+):\s*(.*)\s*$/mg) {
+            my ($key, $value) = ($1, $2);
+            $value =~ s/\377/\n /g;
+            $key =~ tr [A-Z] [a-z];
+            $data{$key} = $value;
+        }
+        # Skip double descriptions
+        next if exists($descriptions{$data{"description-md5"}}{$lang});
+        # some weirdnesses in the files
+        next unless defined $data{"description-".lc($lang)};
+        $descriptions{$data{"description-md5"}}{$lang} = $data{"description-".lc($lang)};
+        $count++;
+    }
+    print "($count)\n";
+}
+close PKG;
+
+print "Writing database (".scalar(keys %descriptions)." unique descriptions)...\n";
+my %descriptions_db;
+tie %descriptions_db, "DB_File", "$DBDIR/descriptions_translated.db.new",
+    O_RDWR|O_CREAT, 0666, $DB_BTREE
+    or die "Error creating DB: $!";
+while (my ($md5, $v) = each(%descriptions)) {
+    my $str = "";
+    while (my ($lang, $desc) = each %$v) {
+        unless ($lang && $desc) {
+            warn "MD5: $md5 LANG: $lang DESC: $desc\n";
+            exit;
+        }
+        $str .= "$lang\001$desc\000";
+    }
+
+    $descriptions_db{$md5} = $str;
+}
+untie %descriptions_db;
+
+rename("$DBDIR/descriptions_translated.db.new",
+       "$DBDIR/descriptions_translated.db");
diff --git a/cron.d/200process_archive b/cron.d/200process_archive
index c3d6b58..6fa394e 100755
--- a/cron.d/200process_archive
+++ b/cron.d/200process_archive
@@ -11,3 +11,5 @@ date
 date
 ./bin/parse-contents
 date
+./bin/parse-translations
+date
diff --git a/lib/Packages/Config.pm b/lib/Packages/Config.pm
index 8252a1b..96f8b67 100644
--- a/lib/Packages/Config.pm
+++ b/lib/Packages/Config.pm
@@ -9,11 +9,11 @@ use Packages::CGI qw( :DEFAULT error );
 our @ISA = qw( Exporter );
 
 our ( $TOPDIR, $DBDIR, $TEMPLATEDIR, $CACHEDIR, $ROOT,
-      @LANGUAGES, $LOCALES,
+      @LANGUAGES, @DDTP_LANGUAGES, $LOCALES,
       @SUITES, @SECTIONS, @ARCHIVES, @ARCHITECTURES,
       @PRIORITIES, %FTP_SITES );
 our @EXPORT_OK = qw( $TOPDIR $DBDIR $TEMPLATEDIR $CACHEDIR $ROOT
-		     @LANGUAGES $LOCALES
+		     @LANGUAGES @DDTP_LANGUAGES $LOCALES
 		     @SUITES @SECTIONS @ARCHIVES @ARCHITECTURES
 		     @PRIORITIES %FTP_SITES );
 our %EXPORT_TAGS = ( all => [ @EXPORT_OK ] );
@@ -39,6 +39,7 @@ sub init {
 	$FTP_SITES{us} = $1 if /^\s*ftpsite="?([^\"]*)"?\s*$/o;
 	$FTP_SITES{$1} = $2 if /^\s*(\w+)_ftpsite="?([^\"]*)"?\s*$/o;
 	@LANGUAGES = split(/\s+/, $1) if /^\s*polangs="?([^\"]*)"?\s*$/o;
+	@DDTP_LANGUAGES = split(/\s+/, $1) if /^\s*ddtplangs="?([^\"]*)"?\s*$/o;
 	@SUITES = split(/\s+/, $1) if /^\s*suites="?([^\"]*)"?\s*$/o;
 	@SECTIONS = split(/\s+/, $1) if /^\s*sections="?([^\"]*)"?\s*$/o;
 	@ARCHIVES = split(/\s+/, $1) if /^\s*archives="?([^\"]*)"?\s*$/o;