Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #else
6 #include <stdio.h>
7 #include <unistd.h>
8 #include "plan9.h"
9 #endif
10 #include "hdr.h"
11 #include "conv.h"
12 #include "kuten208.h"
13 #include "jis.h"
15 /*
16 a state machine for interpreting all sorts of encodings
17 */
18 static void
19 alljis(int c, Rune **r, long input_loc)
20 {
21 static enum { state0, state1, state2, state3, state4 } state = state0;
22 static int set8 = 0;
23 static int japan646 = 0;
24 static int lastc;
25 int n;
26 long l;
28 again:
29 switch(state)
30 {
31 case state0: /* idle state */
32 if(c == ESC){ state = state1; return; }
33 if(c < 0) return;
34 if(!set8 && (c < 128)){
35 if(japan646){
36 switch(c)
37 {
38 case '\\': emit(0xA5); return; /* yen */
39 case '~': emit(0xAF); return; /* spacing macron */
40 default: emit(c); return;
41 }
42 } else {
43 emit(c);
44 return;
45 }
46 }
47 if(c < 0x21){ /* guard against bogus characters in JIS mode */
48 if(squawk)
49 EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
50 emit(c);
51 return;
52 }
53 lastc = c; state = state4; return;
55 case state1: /* seen an escape */
56 if(c == '$'){ state = state2; return; }
57 if(c == '('){ state = state3; return; }
58 emit(ESC); state = state0; goto again;
60 case state2: /* may be shifting into JIS */
61 if((c == '@') || (c == 'B')){
62 set8 = 1; state = state0; return;
63 }
64 emit(ESC); emit('$'); state = state0; goto again;
66 case state3: /* may be shifting out of JIS */
67 if((c == 'J') || (c == 'H') || (c == 'B')){
68 japan646 = (c == 'J');
69 set8 = 0; state = state0; return;
70 }
71 emit(ESC); emit('('); state = state0; goto again;
73 case state4: /* two part char */
74 if(c < 0){
75 if(squawk)
76 EPR "%s: unexpected EOF in %s\n", argv0, file);
77 c = 0x21 | (lastc&0x80);
78 }
79 if(CANS2J(lastc, c)){ /* ms dos sjis */
80 int hi = lastc, lo = c;
81 S2J(hi, lo); /* convert to 208 */
82 n = hi*100 + lo - 3232; /* convert to kuten208 */
83 } else
84 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
85 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
86 nerrors++;
87 if(squawk)
88 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
89 if(!clean)
90 emit(BADMAP);
91 } else {
92 if(l < 0){
93 l = -l;
94 if(squawk)
95 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
96 }
97 emit(l);
98 }
99 state = state0;
103 /*
104 a state machine for interpreting ms-kanji == shift-jis.
105 */
106 static void
107 ms(int c, Rune **r, long input_loc)
109 static enum { state0, state1, state2, state3, state4 } state = state0;
110 static int set8 = 0;
111 static int japan646 = 0;
112 static int lastc;
113 int n;
114 long l;
116 again:
117 switch(state)
119 case state0: /* idle state */
120 if(c == ESC){ state = state1; return; }
121 if(c < 0) return;
122 if(!set8 && (c < 128)){
123 if(japan646){
124 switch(c)
126 case '\\': emit(0xA5); return; /* yen */
127 case '~': emit(0xAF); return; /* spacing macron */
128 default: emit(c); return;
130 } else {
131 emit(c);
132 return;
135 lastc = c; state = state4; return;
137 case state1: /* seen an escape */
138 if(c == '$'){ state = state2; return; }
139 if(c == '('){ state = state3; return; }
140 emit(ESC); state = state0; goto again;
142 case state2: /* may be shifting into JIS */
143 if((c == '@') || (c == 'B')){
144 set8 = 1; state = state0; return;
146 emit(ESC); emit('$'); state = state0; goto again;
148 case state3: /* may be shifting out of JIS */
149 if((c == 'J') || (c == 'H') || (c == 'B')){
150 japan646 = (c == 'J');
151 set8 = 0; state = state0; return;
153 emit(ESC); emit('('); state = state0; goto again;
155 case state4: /* two part char */
156 if(c < 0){
157 if(squawk)
158 EPR "%s: unexpected EOF in %s\n", argv0, file);
159 c = 0x21 | (lastc&0x80);
161 if(CANS2J(lastc, c)){ /* ms dos sjis */
162 int hi = lastc, lo = c;
163 S2J(hi, lo); /* convert to 208 */
164 n = hi*100 + lo - 3232; /* convert to kuten208 */
165 } else {
166 nerrors++;
167 if(squawk)
168 EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
169 if(!clean)
170 emit(BADMAP);
171 state = state0;
172 goto again;
174 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
175 nerrors++;
176 if(squawk)
177 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
178 if(!clean)
179 emit(BADMAP);
180 } else {
181 if(l < 0){
182 l = -l;
183 if(squawk)
184 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
186 emit(l);
188 state = state0;
192 /*
193 a state machine for interpreting ujis == EUC
194 */
195 static void
196 ujis(int c, Rune **r, long input_loc)
198 static enum { state0, state1 } state = state0;
199 static int lastc;
200 int n;
201 long l;
203 switch(state)
205 case state0: /* idle state */
206 if(c < 0) return;
207 if(c < 128){
208 emit(c);
209 return;
211 if(c == 0x8e){ /* codeset 2 */
212 nerrors++;
213 if(squawk)
214 EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
215 if(!clean)
216 emit(BADMAP);
217 return;
219 if(c == 0x8f){ /* codeset 3 */
220 nerrors++;
221 if(squawk)
222 EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
223 if(!clean)
224 emit(BADMAP);
225 return;
227 lastc = c;
228 state = state1;
229 return;
231 case state1: /* two part char */
232 if(c < 0){
233 if(squawk)
234 EPR "%s: unexpected EOF in %s\n", argv0, file);
235 c = 0xA1;
237 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
238 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
239 nerrors++;
240 if(squawk)
241 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
242 if(!clean)
243 emit(BADMAP);
244 } else {
245 if(l < 0){
246 l = -l;
247 if(squawk)
248 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
250 emit(l);
252 state = state0;
256 /*
257 a state machine for interpreting jis-kanji == 2022-JP
258 */
259 static void
260 jis(int c, Rune **r, long input_loc)
262 static enum { state0, state1, state2, state3, state4 } state = state0;
263 static int set8 = 0;
264 static int japan646 = 0;
265 static int lastc;
266 int n;
267 long l;
269 again:
270 switch(state)
272 case state0: /* idle state */
273 if(c == ESC){ state = state1; return; }
274 if(c < 0) return;
275 if(!set8 && (c < 128)){
276 if(japan646){
277 switch(c)
279 case '\\': emit(0xA5); return; /* yen */
280 case '~': emit(0xAF); return; /* spacing macron */
281 default: emit(c); return;
283 } else {
284 emit(c);
285 return;
288 lastc = c; state = state4; return;
290 case state1: /* seen an escape */
291 if(c == '$'){ state = state2; return; }
292 if(c == '('){ state = state3; return; }
293 emit(ESC); state = state0; goto again;
295 case state2: /* may be shifting into JIS */
296 if((c == '@') || (c == 'B')){
297 set8 = 1; state = state0; return;
299 emit(ESC); emit('$'); state = state0; goto again;
301 case state3: /* may be shifting out of JIS */
302 if((c == 'J') || (c == 'H') || (c == 'B')){
303 japan646 = (c == 'J');
304 set8 = 0; state = state0; return;
306 emit(ESC); emit('('); state = state0; goto again;
308 case state4: /* two part char */
309 if(c < 0){
310 if(squawk)
311 EPR "%s: unexpected EOF in %s\n", argv0, file);
312 c = 0x21 | (lastc&0x80);
314 if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
315 emit(lastc);
316 state = state0;
317 goto again;
319 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
320 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
321 nerrors++;
322 if(squawk)
323 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
324 if(!clean)
325 emit(BADMAP);
326 } else {
327 if(l < 0){
328 l = -l;
329 if(squawk)
330 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
332 emit(l);
334 state = state0;
338 static void
339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
341 Rune ob[N];
342 Rune *r, *re;
343 uchar ibuf[N];
344 int n, i;
345 long nin;
347 r = ob;
348 re = ob+N-3;
349 nin = 0;
350 while((n = read(fd, ibuf, sizeof ibuf)) > 0){
351 for(i = 0; i < n; i++){
352 (*procfn)(ibuf[i], &r, nin++);
353 if(r >= re){
354 OUT(out, ob, r-ob);
355 r = ob;
358 if(r > ob){
359 OUT(out, ob, r-ob);
360 r = ob;
363 (*procfn)(-1, &r, nin);
364 if(r > ob)
365 OUT(out, ob, r-ob);
368 void
369 jis_in(int fd, long *notused, struct convert *out)
371 USED(notused);
372 do_in(fd, alljis, out);
375 void
376 ujis_in(int fd, long *notused, struct convert *out)
378 USED(notused);
379 do_in(fd, ujis, out);
382 void
383 msjis_in(int fd, long *notused, struct convert *out)
385 USED(notused);
386 do_in(fd, ms, out);
389 void
390 jisjis_in(int fd, long *notused, struct convert *out)
392 USED(notused);
393 do_in(fd, jis, out);
396 static int first = 1;
398 static void
399 tab_init(void)
401 int i;
402 long l;
404 first = 0;
405 for(i = 0; i < NRUNE; i++)
406 tab[i] = -1;
407 for(i = 0; i < KUTEN208MAX; i++)
408 if((l = tabkuten208[i]) != -1){
409 if(l < 0)
410 tab[-l] = i;
411 else
412 tab[l] = i;
417 /* jis-kanji, or ISO 2022-JP */
418 void
419 jisjis_out(Rune *base, int n, long *notused)
421 char *p;
422 int i;
423 Rune r;
424 static enum { ascii, japan646, jp2022 } state = ascii;
426 USED(notused);
427 if(first)
428 tab_init();
429 nrunes += n;
430 p = obuf;
431 for(i = 0; i < n; i++){
432 r = base[i];
433 if(r < 128){
434 if(state == jp2022){
435 *p++ = ESC; *p++ = '('; *p++ = 'B';
436 state = ascii;
438 *p++ = r;
439 } else {
440 if(tab[r] != -1){
441 if(state != jp2022){
442 *p++ = ESC; *p++ = '$'; *p++ = 'B';
443 state = jp2022;
445 *p++ = tab[r]/100 + ' ';
446 *p++ = tab[r]%100 + ' ';
447 continue;
449 if(squawk)
450 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
451 nerrors++;
452 if(clean)
453 continue;
454 *p++ = BYTEBADMAP;
457 noutput += p-obuf;
458 if(p > obuf)
459 write(1, obuf, p-obuf);
462 /* ms-kanji, or Shift-JIS */
463 void
464 msjis_out(Rune *base, int n, long *notused)
466 char *p;
467 int i, hi, lo;
468 Rune r;
470 USED(notused);
471 if(first)
472 tab_init();
473 nrunes += n;
474 p = obuf;
475 for(i = 0; i < n; i++){
476 r = base[i];
477 if(r < 128)
478 *p++ = r;
479 else {
480 if(tab[r] != -1){
481 hi = tab[r]/100 + ' ';
482 lo = tab[r]%100 + ' ';
483 J2S(hi, lo);
484 *p++ = hi;
485 *p++ = lo;
486 continue;
488 if(squawk)
489 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
490 nerrors++;
491 if(clean)
492 continue;
493 *p++ = BYTEBADMAP;
496 noutput += p-obuf;
497 if(p > obuf)
498 write(1, obuf, p-obuf);
501 /* ujis, or EUC */
502 void
503 ujis_out(Rune *base, int n, long *notused)
505 char *p;
506 int i;
507 Rune r;
509 USED(notused);
510 if(first)
511 tab_init();
512 nrunes += n;
513 p = obuf;
514 for(i = 0; i < n; i++){
515 r = base[i];
516 if(r < 128)
517 *p++ = r;
518 else {
519 if(tab[r] != -1){
520 *p++ = 0x80 | (tab[r]/100 + ' ');
521 *p++ = 0x80 | (tab[r]%100 + ' ');
522 continue;
524 if(squawk)
525 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
526 nerrors++;
527 if(clean)
528 continue;
529 *p++ = BYTEBADMAP;
532 noutput += p-obuf;
533 if(p > obuf)
534 write(1, obuf, p-obuf);