16 a state machine for interpreting all sorts of encodings
19 alljis(int c, Rune **r, long input_loc)
21 static enum { state0, state1, state2, state3, state4 } state = state0;
23 static int japan646 = 0;
31 case state0: /* idle state */
32 if(c == ESC){ state = state1; return; }
34 if(!set8 && (c < 128)){
38 case '\\': emit(0xA5); return; /* yen */
39 case '~': emit(0xAF); return; /* spacing macron */
40 default: emit(c); return;
47 if(c < 0x21){ /* guard against bogus characters in JIS mode */
49 EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
53 lastc = c; state = state4; return;
55 case state1: /* seen an escape */
56 if(c == '$'){ state = state2; return; }
57 if(c == '('){ state = state3; return; }
58 emit(ESC); state = state0; goto again;
60 case state2: /* may be shifting into JIS */
61 if((c == '@') || (c == 'B')){
62 set8 = 1; state = state0; return;
64 emit(ESC); emit('$'); state = state0; goto again;
66 case state3: /* may be shifting out of JIS */
67 if((c == 'J') || (c == 'H') || (c == 'B')){
68 japan646 = (c == 'J');
69 set8 = 0; state = state0; return;
71 emit(ESC); emit('('); state = state0; goto again;
73 case state4: /* two part char */
76 EPR "%s: unexpected EOF in %s\n", argv0, file);
77 c = 0x21 | (lastc&0x80);
79 if(CANS2J(lastc, c)){ /* ms dos sjis */
80 int hi = lastc, lo = c;
81 S2J(hi, lo); /* convert to 208 */
82 n = hi*100 + lo - 3232; /* convert to kuten208 */
84 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
85 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
88 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
95 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
104 a state machine for interpreting ms-kanji == shift-jis.
107 ms(int c, Rune **r, long input_loc)
109 static enum { state0, state1, state2, state3, state4 } state = state0;
111 static int japan646 = 0;
119 case state0: /* idle state */
120 if(c == ESC){ state = state1; return; }
122 if(!set8 && (c < 128)){
126 case '\\': emit(0xA5); return; /* yen */
127 case '~': emit(0xAF); return; /* spacing macron */
128 default: emit(c); return;
135 lastc = c; state = state4; return;
137 case state1: /* seen an escape */
138 if(c == '$'){ state = state2; return; }
139 if(c == '('){ state = state3; return; }
140 emit(ESC); state = state0; goto again;
142 case state2: /* may be shifting into JIS */
143 if((c == '@') || (c == 'B')){
144 set8 = 1; state = state0; return;
146 emit(ESC); emit('$'); state = state0; goto again;
148 case state3: /* may be shifting out of JIS */
149 if((c == 'J') || (c == 'H') || (c == 'B')){
150 japan646 = (c == 'J');
151 set8 = 0; state = state0; return;
153 emit(ESC); emit('('); state = state0; goto again;
155 case state4: /* two part char */
158 EPR "%s: unexpected EOF in %s\n", argv0, file);
159 c = 0x21 | (lastc&0x80);
161 if(CANS2J(lastc, c)){ /* ms dos sjis */
162 int hi = lastc, lo = c;
163 S2J(hi, lo); /* convert to 208 */
164 n = hi*100 + lo - 3232; /* convert to kuten208 */
168 EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
174 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
177 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
184 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
193 a state machine for interpreting ujis == EUC
196 ujis(int c, Rune **r, long input_loc)
198 static enum { state0, state1 } state = state0;
205 case state0: /* idle state */
211 if(c == 0x8e){ /* codeset 2 */
214 EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
219 if(c == 0x8f){ /* codeset 3 */
222 EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
231 case state1: /* two part char */
234 EPR "%s: unexpected EOF in %s\n", argv0, file);
237 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
238 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
241 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
248 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
257 a state machine for interpreting jis-kanji == 2022-JP
260 jis(int c, Rune **r, long input_loc)
262 static enum { state0, state1, state2, state3, state4 } state = state0;
264 static int japan646 = 0;
272 case state0: /* idle state */
273 if(c == ESC){ state = state1; return; }
275 if(!set8 && (c < 128)){
279 case '\\': emit(0xA5); return; /* yen */
280 case '~': emit(0xAF); return; /* spacing macron */
281 default: emit(c); return;
288 lastc = c; state = state4; return;
290 case state1: /* seen an escape */
291 if(c == '$'){ state = state2; return; }
292 if(c == '('){ state = state3; return; }
293 emit(ESC); state = state0; goto again;
295 case state2: /* may be shifting into JIS */
296 if((c == '@') || (c == 'B')){
297 set8 = 1; state = state0; return;
299 emit(ESC); emit('$'); state = state0; goto again;
301 case state3: /* may be shifting out of JIS */
302 if((c == 'J') || (c == 'H') || (c == 'B')){
303 japan646 = (c == 'J');
304 set8 = 0; state = state0; return;
306 emit(ESC); emit('('); state = state0; goto again;
308 case state4: /* two part char */
311 EPR "%s: unexpected EOF in %s\n", argv0, file);
312 c = 0x21 | (lastc&0x80);
314 if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
319 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
320 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
323 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
330 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
350 while((n = read(fd, ibuf, sizeof ibuf)) > 0){
351 for(i = 0; i < n; i++){
352 (*procfn)(ibuf[i], &r, nin++);
363 (*procfn)(-1, &r, nin);
370 jis_in(int fd, long *notused, struct convert *out)
373 do_in(fd, alljis, out);
377 ujis_in(int fd, long *notused, struct convert *out)
380 do_in(fd, ujis, out);
384 msjis_in(int fd, long *notused, struct convert *out)
391 jisjis_in(int fd, long *notused, struct convert *out)
397 static int first = 1;
406 for(i = 0; i < NRUNE; i++)
408 for(i = 0; i < KUTEN208MAX; i++)
409 if((l = tabkuten208[i]) != -1){
418 /* jis-kanji, or ISO 2022-JP */
420 jisjis_out(Rune *base, int n, long *notused)
425 static enum { ascii, japan646, jp2022 } state = ascii;
432 for(i = 0; i < n; i++){
436 *p++ = ESC; *p++ = '('; *p++ = 'B';
443 *p++ = ESC; *p++ = '$'; *p++ = 'B';
446 *p++ = tab[r]/100 + ' ';
447 *p++ = tab[r]%100 + ' ';
451 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
460 write(1, obuf, p-obuf);
463 /* ms-kanji, or Shift-JIS */
465 msjis_out(Rune *base, int n, long *notused)
476 for(i = 0; i < n; i++){
482 hi = tab[r]/100 + ' ';
483 lo = tab[r]%100 + ' ';
490 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
499 write(1, obuf, p-obuf);
504 ujis_out(Rune *base, int n, long *notused)
515 for(i = 0; i < n; i++){
521 *p++ = 0x80 | (tab[r]/100 + ' ');
522 *p++ = 0x80 | (tab[r]%100 + ' ');
526 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
535 write(1, obuf, p-obuf);