RhythmHeavenSilver/tools/abcde/abcde.pl

265 lines
11 KiB
Perl

#!/usr/bin/perl
# abw's basic custom data encoder
package abcde;
use strict;
use warnings;
my $usage = <<USAGE;
usage: $0 <options> [-cm <command module> [<command module options>]]
In its most basic form, this program reads data from STDIN, translates it according to the provided table files and
other general options, and writes it to STDOUT. This form is useful for integration into other programs (e.g. hex
editors, (dis)assemblers, etc.) and for examining the translation process in detail, but is unlikely to be of much
interest to the average user as its scope is limited to the translation of data that should be translated and makes
no attempt to e.g. update pointers or exclude portions of the input data from translation.
The average user is more likely to be interested in the command modules. Currently, varying levels of support for Atlas
and Cartographer command files have been implemented; see those modules' documentation for more details.
Available required options:
-m <mode> / --mode <mode>: select operation mode; <mode> value can be either 'bin2text', in which case binary data on
STDIN is translated to text on STDOUT, or 'text2bin', in which case text data on STDIN is translated to binary
data on STDOUT.
-t <table> / --table <table file>: specify the file name of a table file to use in translation. Include this option
once for each table file to be used in translation. Note that when provided on the command line, relative
paths are assumed to be anchored at the current directory.
Available optional options:
-g / --group: instead of normal output, print the output grouped by token (primarily used for debug, but also available
for interest's sake).
-h / --help: print this help text and exit; if used in conjunction with -cm <command module name>, instead print help
text for the specified command module and exit.
-brp / --base-relative-path <path>: sets the base relative path used by command modules to <path> instead of the
current directory; see command module documentation for details including any special <path> values.
-s / --stats: after normal output, print some statistics to STDERR.
--artificial-end-token <label>: <label> used to indicate the logical end of a fixed-length string during insertion.
--multi-table-files: allow multiple logical tables inside a single table file.
--no-unicode-normalization: if set, prevents the default canonical decomposition (NFD) of all text input and canonical
composition (NFC) of all text output
--start-table-file-name: specify the file name of the table to be used as the starting table for translation; if not
specified, this defaults to the first table provided on the command line with -t.
Available command modules:
abcde::Atlas: Atlas emulation; implies --mode text2bin
abcde::Cartographer: Cartographer emulation; implies --mode bin2text
Available command module options:
All options following the command module name are passed directly to the command module; see the documentation for the
selected command module for details.
USAGE
# TBD:
# also implement longest-leftmost insert (valid for single table + all table entries have same binary width + no normal table entry has > 2 characters + all characters used in normal entries have an entry with just that character)
# also implement dynamic programming insert (valid for single table + no table entry's binary is a prefix of any other table entry's binary)
# also implement insert algorithm selection, defaulting to the fastest valid algorithm
# add string format support (non-text prefixes, with special case for pascal string lengths)
# add more pointer support (ability to specify where individual bits of pointer should be read from)
# update Atlas/Cartographer modules to support bit-oriented operations
# add more/better debug output (especially for Atlas, which has a debug flag) and stats
# ability to import one table file into another (useful e.g. when the game uses dozens of table files that differ only on a couple of control codes)
# increase support for pointer arithmetic (e.g. games that store P in ROM and then access e.g. P/2)
# combining characters stored on line N lines above the characters they combine with, e.g. JP dakuten marks written above their kana in NES games
# add check for writing the same address twice
# convert Cartographer's exclusive addresses to inclusive to match Atlas' behaviour
BEGIN {
require Time::HiRes;
$abcde::stats{totalTime} = Time::HiRes::gettimeofday();
$| = 1;
require File::Basename;
require File::Spec;
my $dir = File::Spec->catdir(File::Basename::dirname(File::Spec->rel2abs(__FILE__)), '.');
# make sure our pre-packaged CPAN modules and abcde::* are in @INC before anything else the user might have installed on their system
unshift(@INC, File::Spec->catdir($dir.'/cpan'), $dir);
binmode(STDERR, ':utf8');
}
{
# for debug only
no warnings qw(once);
require Data::Dumper;
$Data::Dumper::Sortkeys = 1;
}
use Encode ();
use abcde::Table::Table;
use abcde::Table::Token;
my @tableFileNames = ();
my $startTable;
my $startTableFileName;
my $printHelp = 0;
# default values
%abcde::tablesByFileName = ();
%abcde::tablesByID = ();
@abcde::allTables = ();
%abcde::internalTokens = ();
$abcde::commandFileType = '';
@abcde::commandFileArgs = ();
%abcde::options = (
allowMultipleTablesPerFile => 0,
artificial_end_token => undef,
group => 0,
mode => undef,
printStats => 0,
baseRelativePath => File::Spec->curdir(),
unicode_normalization => 1,
);
$abcde::reserved_chars = { open => '<', close => '>' }; # I might make this configurable later
# read options from command line
#@ARGV = map { Encode::decode_utf8($_, 1) } @ARGV;
if (!@ARGV) {
die $usage;
}
while (my $arg = shift(@ARGV)) {
if ($arg eq '--artificial-end-token') {
($abcde::options{artificial_end_token} = shift(@ARGV)) =~ s/(?:\\n)*$//;
if ($abcde::options{artificial_end_token} !~ /^$abcde::reserved_chars->{open}(?:)[^$abcde::reserved_chars->{open}$abcde::reserved_chars->{close}]+$abcde::reserved_chars->{close}$/) {
die ("invalid artificial-end-token label");
}
} elsif ($arg eq '-cm') {
$abcde::commandFileType = shift(@ARGV);
@abcde::commandFileArgs = @ARGV;
last;
} elsif ($arg eq '-g' || $arg eq '--group') {
$abcde::options{group} = 1;
} elsif ($arg eq '-h' || $arg eq '--help') {
$printHelp = 1;
} elsif ($arg eq '-m' || $arg eq '--mode') {
$abcde::options{mode} = lc(shift(@ARGV));
} elsif ($arg eq '--multi-table-files') {
$abcde::options{allowMultipleTablesPerFile} = 1;
} elsif ($arg eq '--no-unicode-normalization') {
$abcde::options{unicode_normalization} = 0;
} elsif ($arg eq '-brp' || $arg eq '--base-relative-path') {
$abcde::options{baseRelativePath} = shift(@ARGV);
} elsif ($arg eq '-s' || $arg eq '--stats') {
$abcde::options{printStats} = 1;
} elsif ($arg eq '--start-table-file-name') {
$startTableFileName = shift(@ARGV);
} elsif ($arg eq '-t' || $arg eq '--table') {
push(@tableFileNames, shift(@ARGV));
} else {
die ("unknown option '$arg'\n\n$usage");
}
}
# start validating command line args
if ($abcde::commandFileType) {
my @parts = split(/'|::/, $abcde::commandFileType);
foreach (@parts) {
die ("invalid command module name") if ($_ !~ /^[A-Za-z_][A-Za-z0-9_]*$/);
}
eval "require $abcde::commandFileType" or die ("error loading command module '$abcde::commandFileType': $@");
$abcde::commandFileType->updateOptions();
}
if ($printHelp) {
if ($abcde::commandFileType) {
$abcde::commandFileType->printHelp();
} else {
print STDOUT $usage;
}
exit;
}
if (!defined($abcde::options{mode}) || $abcde::options{mode} !~ /^(?:bin2text|text2bin)$/) {
die ("missing or unknown mode!");
}
# TBD: are these actually useful anymore?
# set up internal tokens
# FB => no-match fallback token
# FC => expired count fallback token
# FF => forced fallback token
# FI => immediate fallback
foreach my $flag (qw(FB FC FF FI)) {
$abcde::internalTokens{$flag} = abcde::Table::Token->create(undef, {$flag => 1}, '', 0, '');
}
# set up tables
$abcde::minBinPerText = 0.25; # heuristic factor for multi-table insertion; initialized to 1 bit per 4 characters, which is what $abcde::rawTable produces
# $abcde::rawTable and $abcde::rawBitTable must exist before calling finalize() on any other tables
$abcde::rawTable = abcde::Table::Table->generateRawTable(); # contains both bit and byte entries
$abcde::rawBitTable = abcde::Table::Table->generateRawBitTable(); # contains only bit entries; used for bin2text
$abcde::rawTable->finalize();
$abcde::rawBitTable->finalize();
# add any tables passed on the command line so that they will be available for command modules too
foreach my $fileName (@tableFileNames) {
$fileName = File::Spec->rel2abs(File::Spec->catdir($abcde::options{baseRelativePath}, $fileName));
my $tables = abcde::Table::Table->addTableFile($fileName);
$startTable ||= $tables->[0];
}
if ($abcde::commandFileType) {
$abcde::commandFileType->handleCommandFile(\@abcde::commandFileArgs);
} else {
# do stuff that can't be done until we've finished parsing all of the table files
foreach my $table (@abcde::allTables) {
$table->finalize();
}
if (defined($startTableFileName)) {
if ($abcde::tablesByFileName{$startTableFileName}) {
$startTable = $abcde::tablesByFileName{$startTableFileName}->[0];
} else {
die ("--start-table-file-name was given, but no table with that file name exists");
}
} else {
$startTable ||= $abcde::rawTable;
}
# read input, defaulting to translating all of STDIN
{
local $/ = undef;
if ($abcde::options{mode} eq 'bin2text') {
binmode(STDIN, ':raw');
binmode(STDOUT, ':utf8');
} elsif ($abcde::options{mode} eq 'text2bin') {
binmode(STDIN, ':utf8');
binmode(STDOUT, ':raw');
}
$abcde::data = <STDIN>;
if ($abcde::options{mode} eq 'bin2text') {
$abcde::data = unpack('B*', $abcde::data); # bloat input up to 1 character per byte cuz working with bits in perl is super annoying; TBD: this is probably the best place to start when optimizing RAM usage
} elsif ($abcde::options{mode} eq 'text2bin') {
$abcde::data =~ s/\R//g; # ignore newlines during text2bin
}
}
# create a String object, parse our input, print out the result
require abcde::String::BasicString;
my $string = abcde::String::BasicString->create($startTable);
$string->parse();
my $output = $string->output();
if ($abcde::options{mode} eq 'text2bin' && !$abcde::options{group}) {
$output = pack('B*', $output); # convert characters to binary
}
print STDOUT $output;
}
if ($abcde::options{printStats}) {
# print stats
$abcde::stats{totalTime} = (Time::HiRes::gettimeofday() - $abcde::stats{totalTime});
print STDERR "\n=== Stats ($abcde::options{mode}) ===\n";
print STDERR "\nBlock\tStrings\tBits\tTokens\tChars\tTime (s)\n";
foreach my $block_name (sort keys %{$abcde::stats{blocks}}) {
my $block = $abcde::stats{blocks}->{$block_name};
print STDERR "$block_name:\t$block->{strings}\t$block->{bits}\t$block->{tokens}\t$block->{chars}\t$block->{time} seconds\n";
}
print STDERR "Total\t$abcde::stats{totalStrings}\t$abcde::stats{totalBits}\t$abcde::stats{totalTokens}\t$abcde::stats{totalChars}\t".sprintf('%.3f', $abcde::stats{totalTime})." seconds\n";
my $other_stats = "";
foreach my $key (sort keys %abcde::stats) {
next if ($key =~ /^(?:blocks|totalStrings|totalBits|totalTokens|totalChars|times|totalTime)$/);
$other_stats .= "$key: $abcde::stats{$key}\n";
}
print STDERR "\nOther stats:\n$other_stats" if ($other_stats);
}