diff options
Diffstat (limited to 'contrib/syslinux-4.02/codepage/gensubset.pl')
-rwxr-xr-x | contrib/syslinux-4.02/codepage/gensubset.pl | 57 |
1 files changed, 57 insertions, 0 deletions
#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Standard input is read twice (it is rewound between the two passes),
# so it has to be a seekable file rather than a pipe.
#

use strict;
use warnings;

# Code points (stored as numbers) that must appear in the output.
my %need_these;

# Mark as needed all the characters mentioned in the relevant files.
# Once comments and trailing blanks are stripped, only lines consisting
# of exactly two whitespace-separated fields are used; the second field
# is read as a hexadecimal code point.
foreach my $file (@ARGV) {
    open(my $fh, '<', $file)
        or die "$0: cannot open $file: $!\n";
    while (defined(my $line = <$fh>)) {
        $line =~ s/\s*(\#.*|)$//;   # Remove comments and final blanks
        my @f = split(/\s+/, $line);
        next if (scalar @f != 2);
        $need_these{hex $f[1]}++;
    }
    close($fh);
}

# Also mark as needed any case variants of those
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
while (defined(my $line = <STDIN>)) {
    # Strip the line terminator first: otherwise the last field of a
    # record with an empty title-case mapping is "\n", which is not the
    # empty string and would be converted by hex() to code point 0.
    $line =~ s/[\r\n]+\z//;

    # A negative limit keeps trailing empty fields, so the field
    # indexes stay stable for records ending in ";".
    my @f = split(/;/, $line, -1);
    next unless (defined $f[0] && $f[0] =~ /^([0-9a-f]+)$/i);
    next unless ($need_these{hex $f[0]});

    # Fields 12..14 are the simple upper/lower/title case mappings.
    foreach my $variant (@f[12 .. 14]) {
        next unless (defined $variant && $variant =~ /\A[0-9a-f]+\z/i);
        $need_these{hex $variant}++;
    }
}

# Finally, write out the subset.
# A failed rewind would leave STDIN at end-of-file and silently produce
# an empty output file, so treat it as a fatal error instead.
seek(STDIN, 0, 0)
    or die "$0: cannot rewind standard input (it must be a regular file): $!\n";

while (defined(my $line = <STDIN>)) {
    # $rest keeps its original line terminator, so none is added below.
    my ($v, $rest) = split(/;/, $line, 2);
    next unless (defined $v);
    $rest = '' unless (defined $rest);

    my ($r1, $r2);
    if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
        # This isn't actually the format... fix that if it ever matters
        $r1 = hex $1;
        $r2 = hex $2;
    } elsif ($v =~ /^([0-9a-f]+)$/i) {
        $r1 = $r2 = hex $1;
    } else {
        next;
    }

    foreach my $r ($r1 .. $r2) {
        printf "%04X;%s", $r, $rest if ($need_these{$r});
    }
}