rockbox/tools/genlang

767 lines
22 KiB
Perl
Executable File

#!/usr/bin/perl -s
# __________ __ ___.
# Open \______ \ ____ ____ | | _\_ |__ _______ ___
# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
# \/ \/ \/ \/ \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#
# See apps/language.c (TODO: Use common include for both)
# Cookie and binary version for the binary lang file
my $LANGUAGE_COOKIE = 0x1a;
my $VOICE_COOKIE = 0x9a;
my $LANGUAGE_VERSION = 0x06;
my $LANGUAGE_FLAG_RTL = 0x01;
my $HEADER_SIZE = 4;
my $SUBHEADER_SIZE = 6;
# A note for future users and readers: The original v1 language system allowed
# the build to create and use a different language than english built-in. We
# removed that feature from our build-system, but the build scripts still had
# the ability. But, starting now, this ability is no longer provided since I
# figured it was boring and unnecessary to write support for now since we
# don't use it anymore.
if(!$ARGV[0]) {
print <<MOO
Usage: genlang [options] <langv2 file>
-p=<prefix>
Make the tool create a [prefix].c and [prefix].h file.
-b=<outfile>
Make the tool create a binary language (.lng) file named [outfile].
The use of this option requires that you also use -e, -t and -i.
-c=<outfile>
Create binary voicestring file named [outfile]. Works like -b and can be
used the same time.
-e=<english lang file>
Point out the english (original source) file, to use that as master
language template. Used in combination with -b, -u or -s.
-s
Sort the Update language file in the same order as the strings in the
English file.
-t=<target>
Specify which target you want the translations/phrases for. Required when
-b or -p is used.
The target can in fact be specified as numerous different strings,
separated with colons. This will make genlang to use all the specified
strings when searching for a matching phrase.
-i=<target id>
The target id number, needed for -b.
-o
Voice mode output. Outputs all id: and voice: lines for the given target!
-v
Enables verbose (debug) output.
MOO
;
exit;
}
# How update works:
#
# 1) scan the english file, keep the whole <phrase> for each phrase.
# 2) read the translated file, for each end of phrase, compare:
# A) all source strings, if there's any change there should be a comment about
# it output
# B) the desc fields
#
# 3) output the phrase with the comments from above
# 4) check which phrases that the translated version didn't have, and spit out
# the english version of those
#
my $prefix = $p;
my $binary = $b;
my $sortfile = $s;
my $binvoice = $c;
my $english = $e;
my $voiceout = $o;
my $check = ($binary?.5:0) + ($prefix?1:0) + ($voiceout?1:0) + ($sortfile?1:0) + ($binvoice?.5:0);
if($check > 1) {
print STDERR "Please use only one of -p, -o, -b, -c and -s\n";
exit;
}
if(!$check) {
print STDERR "Please use at least one of -p, -o, -c, -e and -s\n";
exit;
}
if(($binary || $voiceout || $sortfile) && !$english) {
print STDERR "Please use -e too when you use -b, -o, -u or -s\n";
exit;
}
my $target_id = $i;
if($binary && !$target_id) {
print STDERR "Please specify a target id number (with -i)!\n";
exit;
}
my $target = $t;
if(!$target && !$sortfile) {
print STDERR "Please specify a target (with -t)!\n";
exit;
}
# Build up a regex which can be applied to target wildcard lists. We only need
# to support prefix matches, so a target parameter of foo:bar can be expanded
# to the regex "\*|f\*|fo\*|foo|b\*|ba\*|bar" and applied to the wildcard list
# (plus end-of-string or commas on either side). The regex engine should
# discard any duplicates generated for us in the process of constructing the
# state machine, so we don't bother to check.
my $target_regex = "(?:^|,) *(?:\\*";
foreach my $target_part (split ':', $target) {
for (my $c=1; $c<=length $target_part; $c++) {
my $partial = substr $target_part, 0, $c;
$target_regex .= "|$partial\\*";
}
$target_regex .= "|$target_part";
}
$target_regex .= ") *(?:,|\$)";
$target_regex = qr/$target_regex/;
my $binpath = "";
if ($binary =~ m|(.*)/[^/]+|) {
$binpath = $1;
}
my $verbose=$v;
my %id; # string to num hash
my @idnum; # num to string array
my %allphrases; # For sorting - an array of the <phrase> elements
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash
my %users =
('core' => 0);
my $input = $ARGV[0];
my @m;
my $m="blank";
sub trim {
my ($string) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub blank {
# nothing to do
}
my %head;
sub header {
my ($full, $n, $v)=@_;
$head{$n}=$v;
}
my %phrase;
sub phrase {
my ($full, $n, $v)=@_;
$phrase{$n}=$v;
}
my %options;
sub options {
my ($full, $n, $v)=@_;
$options{$n}=$v;
}
sub parsetarget {
my ($debug, $strref, $full, $n, $v)=@_;
my $string;
if ($n =~ $target_regex) {
$string = $v;
$$strref = $string;
return $string;
}
}
my $src;
sub source {
parsetarget("src", \$src, @_);
}
my $dest;
sub dest {
parsetarget("dest", \$dest, @_);
}
my $voice;
sub voice {
parsetarget("voice", \$voice, @_);
}
sub file_is_newer {
my ($file1, $file2) = @_;
my @s1 = stat $file1;
my @s2 = stat $file2;
return 1 if ($s1[9] > $s2[9]);
return 0;
}
my %idmap;
my %english;
if($english) {
readenglish();
}
sub readenglish {
# For the cases where the english file needs to be scanned/read, we do
# it before we read the translated file. For -b it isn't necessary, but for
# -u it is convenient.
my @idnum = ((0)); # start with a true number
my @vidnum = ((0x8000)); # first voice id
if ($binary and file_is_newer("$binpath/english.list", $english)) {
open(ENG, "<$binpath/english.list") ||
die "Error: can't open $binpath/english.list";
while (<ENG>) {
my ($user, $id, $value) = split ':', $_;
$idmap[$user]{$id} = $value;
$english{$id} = 1;
}
close ENG;
return;
}
open(ENG, "<$english") || die "Error: can't open $english";
my @phrase;
my $id;
my $maybeid;
my $user;
my $withindest;
my $numphrases = 0;
my $numusers = 1; # core is already in the users map
while(<ENG>) {
# get rid of DOS newlines
$_ =~ tr/\r//d;
if($_ =~ /^ *\<phrase\>/) {
# this is the start of a phrase
}
elsif($_ =~ /\<\/phrase\>/) {
# if id is something, when we count and store this phrase
if($id) {
# voice-only entries get a difference range
if($id =~ /^VOICE_/) {
# Assign an ID number to this entry
$idmap[$user]{$id}=$vidnum[$user];
$vidnum[$user]++;
}
else {
# Assign an ID number to this entry
$idmap[$user]{$id}=$idnum[$user];
$idnum[$user]++;
# print STDERR "DEST: bumped idnum to $idnum[$user]\n";
}
# this is the end of a phrase, add it to the english hash
$english{$id}=join("", @phrase);
}
undef @phrase;
$id="";
}
elsif($_ ne "\n") {
# gather everything related to this phrase
push @phrase, $_;
if($_ =~ /^ *\<dest\>/i) {
$withindest=1;
$deststr="";
}
elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
$withindest=0;
if($deststr && ($deststr !~ /^none\z/i)) {
# we unconditionally always use all IDs when the "update"
# feature is used
$id = $maybeid;
# print "DEST: use this id $id\n";
}
else {
# print "skip $maybeid for $name\n";
}
}
elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
my ($name, $val)=($1, $2);
$dest=""; # in case it is left untouched for when the
# model name isn't "our"
dest($_, $name, $val);
if($dest) {
# Store the current dest string. If this target matches
# multiple strings, it will get updated several times.
$deststr = $dest;
}
}
}
if($_ =~ /^ *id: ([^ \t\n]+)/i) {
$maybeid=$1;
$sortorder{$maybeid}=$numphrases++;
}
if($_ =~ /^ *user: ([^ \t\n]+)/i) {
$user = $users{$1};
if(!(defined $user)) {
$user = ++$numusers;
$users{$1} = $user;
}
}
}
close(ENG);
}
my @idcount; # counter for lang ID numbers
my @voiceid; # counter for voice-only ID numbers
for (keys %users) {
push @idcount, 0;
push @voiceid, 0x8001;
}
#
# Now start the scanning of the selected language string
#
open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
my @phrase;
my $header = 1;
my $langoptions = 0;
while(<LANG>) {
$line++;
# get rid of DOS newlines
$_ =~ tr/\r//d;
if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
# comment or empty line - output it if it's part of the header
if ($header and $sortfile) {
print($_);
}
next;
}
$header = 0;
my $ll = $_;
# print "M: $m\n";
push @phrase, $ll;
# this is an XML-lookalike tag
if (/^(<|[^\"<]+<)([^>]*)>/) {
my $part = $2;
# print "P: $part\n";
if($part =~ /^\//) {
# this was a closing tag
if($part eq "/phrase") {
# closing the phrase
my $idstr = $phrase{'id'};
my $idnum;
if(($binary || $binvoice || $voiceout) && !$english{$idstr}) {
# $idstr doesn't exist for english, skip it\n";
# FIXME/TODO: Any reason this filter shouldn't always be enabled?
}
elsif($dest =~ /^none\z/i) {
# "none" as dest (without quotes) means that this entire
# phrase is to be ignored
}
elsif($sortfile) {
$allphrases{$idstr}=join('',@phrase);
}
else {
# allow the keyword 'deprecated' to be used on dest and
# voice strings to mark that as deprecated. It will then
# be replaced with "".
$dest =~ s/^deprecate(|d)\z/\"\"/i;
$voice =~ s/^deprecate(|d)\z/\"\"/i;
# basic syntax error alerts, if there are no quotes we
# will assume an empty string was intended
if($dest !~ /^\"/) {
print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
$dest='""';
}
if($src !~ /^\"/) {
print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
$src='""';
}
if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
$voice='""';
}
if($dest eq '""' && $phrase{'desc'} !~ /deprecated/i && $idstr !~ /^VOICE/) {
print STDERR "$input:$line:1: warning: empty dest before line in non-deprecated phrase!\n";
}
my $userstr = trim($phrase{'user'});
my $user = $users{$userstr};
if ($userstr eq "") {
print STDERR "$input:$line:1: warning: missing user!\n";
$user = $users{"core"};
}
elsif(!(defined $user)) {
if($english) {
print STDERR "$input:$line:1: warning: user was not found in $english!\n";
$user = keys %users; # set to an invalid user so it won't be added
}
else {
# we found a new user, add it to the usermap
$user = ++$numusers;
$users{$userstr} = $user;
}
}
# Use the ID name to figure out which id number range we
# should use for this phrase. Voice-only strings are
# separated.
if($idstr =~ /^VOICE/) {
$idnum = $voiceid[$user]++;
}
else {
$idnum = $idcount[$user]++;
}
$id{$idstr} = $idnum;
$idnum[$user][$idnum]=$idstr;
$source{$idstr}=$src;
$dest{$idstr}=$dest;
$voice{$idstr}=$voice;
if($verbose) {
print "id: $phrase{id} ($idnum)\n";
print "source: $src\n";
print "dest: $dest\n";
print "voice: $voice\n";
print "user: $user\n";
}
undef $src;
undef $dest;
undef $voice;
undef $user;
undef %phrase;
}
undef @phrase;
} # end of </phrase>
elsif($part eq "/options") {
# closing the options
if ($options{'rtl'}) {
$langoptions |= $LANGUAGE_FLAG_RTL;
}
} # end of </options>
# starts with a slash, this _ends_ this section
$m = pop @m; # get back old value, the previous level's tag
next;
} # end of tag close
# This is an opening (sub) tag
push @m, $m; # store old value
$m = $part;
next;
}
if(/^ *([^:]+): *(.*)/) {
my ($name, $val)=($1, $2);
&$m($_, $name, $val);
}
}
close(LANG);
if ($sortfile) {
for(sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases) {
print $allphrases{$_};
}
}
if($prefix) {
# We create a .c and .h file
open(HFILE_CORE, ">$prefix/lang.h") ||
die "Error: couldn't create file $prefix/lang.h\n";
open(CFILE_CORE, ">$prefix/lang_core.c") ||
die "Error: couldn't create file $prefix/lang_core.c\n";
# get header file name
$headername = "$prefix/lang.h";
$headername =~ s/(.*\/)*//;
print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
/*
* The str() macro/functions is how to access strings that might be
* translated. Use it like str(MACRO) and expect a string to be
* returned!
*/
#define str(x) language_strings[x]
/* this is the array for holding the string pointers.
It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char core_language_builtin[];
#include "${prefix}_enum.h"
MOO
;
close(HFILE_CORE);
open(HFILE_CORE, ">${prefix}_enum.h") ||
die "couldn't create file ${prefix}_enum.h\n";
print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
#ifndef _LANG_ENUM_H_
#define _LANG_ENUM_H_
/* The enum below contains all available strings */
enum \{
MOO
;
print CFILE_CORE <<MOO
/* This file was automatically generated using genlang, the strings come
from "$input" */
#include "$headername"
unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char core_language_builtin[] =
MOO
;
# Output the ID names for the enum in the header file
my $i;
for $i (0 .. $idcount[$users{"core"}]-1) {
my $name=$idnum[$users{"core"}][$i]; # get the ID name
$name =~ tr/\"//d; # cut off the quotes
printf HFILE_CORE (" %s, /* %d */\n", $name, $i);
}
# Output separation marker for last string ID and the upcoming voice IDs
print HFILE_CORE <<MOO
LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
/* --- below this follows voice-only strings --- */
VOICEONLY_DELIMITER = 0x8000,
MOO
;
# Output the ID names for the enum in the header file
for $i (0x8001 .. ($voiceid[$users{"core"}]-1)) {
my $name=$idnum[$users{"core"}][$i]; # get the ID name
$name =~ tr/\"//d; # cut off the quotes
printf HFILE_CORE (" %s, /* 0x%x */\n", $name, $i);
}
# Output end of lang_enum.h
print HFILE_CORE <<MOO
LANG_LAST_VOICEONLY_INDEX_IN_ARRAY /* this is not a string, this is a marker */
};
/* end of generated enum list */
#endif /* _LANG_ENUM_H_ */
MOO
;
# Output the target phrases for the source file
for $i (0 .. $idcount[$users{"core"}]-1) {
my $name=$idnum[$users{"core"}][$i]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
$dest =~ s:\"$:\\0\":; # insert a \0 before the second quote
if(!$dest) {
# this is just to be on the safe side
$dest = '"\0"';
}
printf CFILE_CORE (" %s\n", $dest);
}
# Output end of string chunk
print CFILE_CORE <<MOO
;
/* end of generated string list */
MOO
;
close(HFILE_CORE);
close(CFILE_CORE);
} # end of the c/h file generation
elsif($binary || $binvoice) {
# Creation of a binary lang file was requested
# We must first scan the english file to get the correct order of the id
# numbers used there, as that is what sets the id order for all language
# files. The english file is scanned before the translated file was
# scanned.
if($binary) {
open(OUTF, ">$binary") or die "Error: Can't create $binary";
binmode OUTF;
printf OUTF ("%c%c%c%c", $LANGUAGE_COOKIE, $LANGUAGE_VERSION, $target_id,
$langoptions); # magic lang file header
}
if($binvoice) {
open(OUTV, ">$binvoice") or die "Error: Can't create $binary";
binmode OUTV;
printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id,
$langoptions); # magic lang file header
}
# output the number of strings for each user
my $foffset = $HEADER_SIZE + $SUBHEADER_SIZE * keys(%users);
for (keys %users) {
my $size;
for $n (0 .. $idcount[$_]-1) {
$size += length(trim($dest{$idnum[$_][$n]})) + 1;
}
if($binary) {
printf OUTF ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
}
if($binvoice) {
printf OUTV ("%c%c%c%c%c%c", ($idcount[$_] >> 8), ($idcount[$_] & 0xff),
($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
}
$foffset += $size;
}
for (keys %users) {
# loop over the target phrases
# This loops over the strings in the translated language file order
my @ids = ((0 .. ($idcount[$_]-1)));
push @ids, (0x8001 .. ($voiceid[$_]-1));
for $n (@ids) {
my $name=$idnum[$_][$n]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
my $voice = $voice{$name}; # get the destination voice string
if($dest && $n < 0x8000 && $binary) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name};
printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest);
}
if($voice && $binvoice) {
$voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name};
printf OUTV ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $voice);
}
}
}
if($binary) {
close(OUTF);
}
if($binvoice) {
close(OUTV);
}
}
elsif($voiceout) {
# voice output requested, display id: and voice: strings in a v1-like
# fashion
my @engl;
for (keys %users) {
# loop over the target phrases
# This loops over the strings in the translated language file order
my @ids = ((0 .. ($idcount[$_]-1)));
push @ids, (0x8001 .. ($voiceid[$_]-1));
for $n (@ids) {
my $name=$idnum[$_][$n]; # get the ID
my $dest = $dest{$name}; # get the destination phrase
my $voice = $voice{$name}; # get the destination voice string
if($voice) {
$voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
# Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name};
$engl[$idnum] = "#$idnum ($n)\nid: $name\nvoice: \"$voice\"\n";
}
}
}
# Print the list in the the English sort order
for (@engl) {
print $_;
}
}
if($verbose) {
my $num_str = 0;
for (keys %users) {
$num_str += $idcount[$_];
}
printf("%d ID strings scanned\n", $num_str);
print "* head *\n";
for(keys %head) {
printf "$_: %s\n", $head{$_};
}
}
if ($binary and !file_is_newer("$binpath/english.list", $english)) {
open(ENGLIST, ">$binpath/english.list") ||
die "Failed creating $binpath/english.list";
for my $user (keys %users) {
for my $id (keys %{$idmap[$user]}) {
print ENGLIST "$user:$id:$idmap[$user]{$id}\n";
}
}
close ENGLIST;
}