summary refs log tree commit diff stats
path: root/contrib/syslinux-4.02/codepage/gensubset.pl
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/syslinux-4.02/codepage/gensubset.pl')
-rwxr-xr-x contrib/syslinux-4.02/codepage/gensubset.pl 57
1 files changed, 57 insertions, 0 deletions
diff --git a/contrib/syslinux-4.02/codepage/gensubset.pl b/contrib/syslinux-4.02/codepage/gensubset.pl
new file mode 100755
index 0000000..4dd7f2c
--- /dev/null
+++ b/contrib/syslinux-4.02/codepage/gensubset.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+#
+# Generate a subset of the UnicodeData.txt file, available from
+# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
+#
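+# Usage:
+# gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
+#
+# UnicodeData.txt must be redirected from a regular (seekable) file, not
+# piped in: standard input is read twice.
+#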
+
+# Set of needed code points: the key is the numeric code point, and any
+# true value means that character must appear in the output subset.
+%need_these = ();
+
+# Mark as needed all the characters mentioned in the relevant files.
+# A line is used only if it splits into exactly two whitespace-separated
+# fields once a trailing "# comment" has been stripped; the second field
+# is parsed as a hexadecimal code point.  All other lines are skipped.
+foreach my $file (@ARGV) {
+    # Report which file failed and why instead of a bare "Died".
+    open(my $fh, '<', $file)
+        or die "$0: cannot open $file: $!\n";
+    while (defined(my $line = <$fh>)) {
+        $line =~ s/\s*(\#.*|)$//;    # Remove comments and final blanks
+        my @f = split(/\s+/, $line);
+        next if (scalar @f != 2);
+        $need_these{hex $f[1]}++;
+    }
+    close($fh);
+}
+
+# Also mark as needed any case variants of those
+# (Note: this doesn't necessarily provide the full transitive closure,
+# but we shouldn't need it.)
+# Fields 12, 13 and 14 of a UnicodeData.txt record are the simple
+# uppercase, lowercase and titlecase mappings.
+while (defined(my $line = <STDIN>)) {
+    # Strip the line ending first: field 14 is the last one on the line,
+    # so an empty titlecase mapping would otherwise read as "\n", pass
+    # the emptiness test and mark U+0000 as needed (hex "\n" is 0).
+    $line =~ s/[\r\n]+$//;
+    # A limit of -1 keeps trailing empty fields instead of dropping them.
+    my @f = split(/;/, $line, -1);
+    next unless (defined($f[0]) && $f[0] =~ /^[0-9a-f]+$/i);
+    next unless ($need_these{hex $f[0]});
+    foreach my $idx (12, 13, 14) {
+        my $variant = $f[$idx];
+        next unless (defined($variant) && $variant ne '');
+        $need_these{hex $variant}++;
+    }
+}
+
+# Finally, write out the subset
+# This pass re-reads standard input from the beginning, which only works
+# when it is a seekable file.  On a pipe seek() fails; without the check
+# the loop below would see end-of-file at once and the script would
+# silently write an empty subset.
+seek(STDIN, 0, 0)
+    or die "$0: cannot rewind standard input (need a regular file): $!\n";
+while (defined(my $line = <STDIN>)) {
+    # Split off the code point field; $rest keeps the remainder of the
+    # record, including its line ending, so it can be echoed verbatim.
+    my ($v, $rest) = split(/;/, $line, 2);
+    next unless (defined($v));
+    my ($first, $last);
+    if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
+        # This isn't actually the format... fix that if it ever matters
+        # (UnicodeData.txt marks ranges with paired "<..., First>" and
+        # "<..., Last>" records rather than "XXXX-YYYY").
+        ($first, $last) = (hex $1, hex $2);
+    } elsif ($v =~ /^([0-9a-f]+)$/i) {
+        $first = $last = hex $1;
+    } else {
+        next;
+    }
+    # Emit one record per needed code point in the (possibly single-
+    # element) range, rewriting the code point as 4+ uppercase hex digits.
+    foreach my $cp ($first .. $last) {
+        printf "%04X;%s", $cp, $rest if ($need_these{$cp});
+    }
+}
+
+