summaryrefslogtreecommitdiffstats
path: root/contrib/syslinux-4.02/codepage/cptable.pl
blob: e29cf006ca56bb2551f39dab941f2b08ea8f162b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/perl
#
# Produce a codepage matching table.  For each 8-bit character, list
# a primary and an alternate match (the latter used for case-insensitive
# matching.)
#
# Usage:
#	cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
#
# Note: for the format of the UnicodeData file, see:
# http://www.unicode.org/Public/UNIDATA/UCD.html
#

($ucd, $cpco, $cpfs, $cpout) = @ARGV;

if (!defined($cpout)) {
    die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
}

%ucase   = ();
%lcase   = ();
%tcase   = ();
%decomp  = ();

open(UCD, '<', $ucd)
    or die "$0: could not open unicode data: $ucd: $!\n";
while (defined($line = <UCD>)) {
    chomp $line;
    @f = split(/;/, $line);
    $n = hex $f[0];
    $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
    $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
    $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
    if ($f[5] =~ /^[0-9A-F\s]+$/) {
	# This character has a canonical decomposition.
	# The regular expression rejects angle brackets, so other
	# decompositions aren't permitted.
	$decomp{$n} = [];
	foreach my $dch (split(' ', $f[5])) {
	    push(@{$decomp{$n}}, hex $dch);
	}
    }
}
close(UCD);

#
# Filesystem and console codepages.  The filesystem codepage is used
# for FAT shortnames, whereas the console codepage is whatever is used
# on the screen and keyboard.
#
@xtab = (undef) x 256;
%tabx = ();
open(CPFS, '<', $cpfs)
    or die "$0: could not open fs codepage: $cpfs: $!\n";
while (defined($line = <CPFS>)) {
    $line =~ s/\s*(\#.*|)$//;
    @f = split(/\s+/, $line);
    next if (scalar @f != 2);
    next if (hex $f[0] > 255);
    $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
    $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
}
close(CPFS);

@ytab = (undef) x 256;
%taby = ();
open(CPCO, '<', $cpco)
    or die "$0: could not open console codepage: $cpco: $!\n";
while (defined($line = <CPCO>)) {
    $line =~ s/\s*(\#.*|)$//;
    @f = split(/\s+/, $line);
    next if (scalar @f != 2);
    next if (hex $f[0] > 255);
    $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
    $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
}
close(CPCO);

open(CPOUT, '>', $cpout)
    or die "$0: could not open output file: $cpout: $!\n";
#
# Magic number, in anticipation of being able to load these
# files dynamically...
#
print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);

# Header fields available for future use...
print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);

#
# Self (shortname) uppercase table.
# This depends both on the console codepage and the filesystem codepage;
# the logical transcoding operation is:
#
# $tabx{$ucase{$ytab[$i]}}
#
# ... where @ytab is console codepage -> Unicode and
# %tabx is Unicode -> filesystem codepage.
#
@uctab = (undef) x 256;
for ($i = 0; $i < 256; $i++) {
    $uuc = $ucase{$ytab[$i]};	# Unicode upper case
    if (defined($tabx{$uuc})) {
	# Straight-forward conversion
	$u = $tabx{$uuc};
    } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
	# Upper case equivalent stripped of accents
	$u = $tabx{${$decomp{$uuc}}[0]};
    } else {
	# No equivalent at all found.  Assume it is a lower-case-only
	# character, like greek alpha in CP437.
	$u = $i;
    }
    $uctab[$i] = $u;
    print CPOUT pack("C", $u);
}

#
# Self (shortname) lowercase table.
# This depends both on the console codepage and the filesystem codepage;
# the logical transcoding operation is:
#
# $taby{$lcase{$xtab[$i]}}
#
# ... where @ytab is console codepage -> Unicode and
# %tabx is Unicode -> filesystem codepage.
#
@lctab = (undef) x 256;
for ($i = 0; $i < 256; $i++) {
    $llc = $lcase{$xtab[$i]};	# Unicode lower case
    if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
	# Straight-forward conversion
    } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
	# Lower case equivalent stripped of accents
    } else {
	# No equivalent at all found.  Find *anything* that matches the
	# bijection criterion...
	for ($l = 0; $l < 256; $l++) {
	    last if ($uctab[$l] == $i);
	}
	$l = $i if ($l == 256);	# If nothing, we're screwed anyway...
    }
    $lctab[$i] = $l;
    print CPOUT pack("C", $l);
}

#
# Unicode (longname) matching table.
# This only depends on the console codepage.
#
$pp0 = '';  $pp1 = '';
for ($i = 0; $i < 256; $i++) {
    if (!defined($ytab[$i])) {
	$p0 = $p1 = 0xffff;
    } else {
	$p0 = $ytab[$i];
	if ($ucase{$p0} != $p0) {
	    $p1 = $ucase{$p0};
	} elsif ($lcase{$p0} != $p0) {
	    $p1 = $lcase{$p0};
	} elsif ($tcase{$p0} != $p0) {
	    $p1 = $tcase{$p0};
	} else {
	    $p1 = $p0;
	}
    }
    # Only the BMP is supported...
    $p0 = 0xffff if ($p0 > 0xffff);
    $p1 = 0xffff if ($p1 > 0xffff);
    $pp0 .= pack("v", $p0);
    $pp1 .= pack("v", $p1);
}
print CPOUT $pp0, $pp1;
close (CPOUT);