Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #else
6 #include <stdio.h>
7 #include <unistd.h>
8 #include "plan9.h"
9 #endif
10 #include "hdr.h"
11 #include "conv.h"
12 #include "kuten208.h"
13 #include "jis.h"
15 /*
16 a state machine for interpreting all sorts of encodings
17 */
18 static void
19 alljis(int c, Rune **r, long input_loc)
20 {
21 static enum { state0, state1, state2, state3, state4 } state = state0;
22 static int set8 = 0;
23 static int japan646 = 0;
24 static int lastc;
25 int n;
26 long l;
28 again:
29 switch(state)
30 {
31 case state0: /* idle state */
32 if(c == ESC){ state = state1; return; }
33 if(c < 0) return;
34 if(!set8 && (c < 128)){
35 if(japan646){
36 switch(c)
37 {
38 case '\\': emit(0xA5); return; /* yen */
39 case '~': emit(0xAF); return; /* spacing macron */
40 default: emit(c); return;
41 }
42 } else {
43 emit(c);
44 return;
45 }
46 }
47 if(c < 0x21){ /* guard against bogus characters in JIS mode */
48 if(squawk)
49 EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
50 emit(c);
51 return;
52 }
53 lastc = c; state = state4; return;
55 case state1: /* seen an escape */
56 if(c == '$'){ state = state2; return; }
57 if(c == '('){ state = state3; return; }
58 emit(ESC); state = state0; goto again;
60 case state2: /* may be shifting into JIS */
61 if((c == '@') || (c == 'B')){
62 set8 = 1; state = state0; return;
63 }
64 emit(ESC); emit('$'); state = state0; goto again;
66 case state3: /* may be shifting out of JIS */
67 if((c == 'J') || (c == 'H') || (c == 'B')){
68 japan646 = (c == 'J');
69 set8 = 0; state = state0; return;
70 }
71 emit(ESC); emit('('); state = state0; goto again;
73 case state4: /* two part char */
74 if(c < 0){
75 if(squawk)
76 EPR "%s: unexpected EOF in %s\n", argv0, file);
77 c = 0x21 | (lastc&0x80);
78 }
79 if(CANS2J(lastc, c)){ /* ms dos sjis */
80 int hi = lastc, lo = c;
81 S2J(hi, lo); /* convert to 208 */
82 n = hi*100 + lo - 3232; /* convert to kuten208 */
83 } else
84 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
85 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
86 nerrors++;
87 if(squawk)
88 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
89 if(!clean)
90 emit(BADMAP);
91 } else {
92 if(l < 0){
93 l = -l;
94 if(squawk)
95 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
96 }
97 emit(l);
98 }
99 state = state0;
103 /*
104 a state machine for interpreting ms-kanji == shift-jis.
105 */
106 static void
107 ms(int c, Rune **r, long input_loc)
109 static enum { state0, state1, state2, state3, state4 } state = state0;
110 static int set8 = 0;
111 static int japan646 = 0;
112 static int lastc;
113 int n;
114 long l;
116 again:
117 switch(state)
119 case state0: /* idle state */
120 if(c == ESC){ state = state1; return; }
121 if(c < 0) return;
122 if(!set8 && (c < 128)){
123 if(japan646){
124 switch(c)
126 case '\\': emit(0xA5); return; /* yen */
127 case '~': emit(0xAF); return; /* spacing macron */
128 default: emit(c); return;
130 } else {
131 emit(c);
132 return;
135 lastc = c; state = state4; return;
137 case state1: /* seen an escape */
138 if(c == '$'){ state = state2; return; }
139 if(c == '('){ state = state3; return; }
140 emit(ESC); state = state0; goto again;
142 case state2: /* may be shifting into JIS */
143 if((c == '@') || (c == 'B')){
144 set8 = 1; state = state0; return;
146 emit(ESC); emit('$'); state = state0; goto again;
148 case state3: /* may be shifting out of JIS */
149 if((c == 'J') || (c == 'H') || (c == 'B')){
150 japan646 = (c == 'J');
151 set8 = 0; state = state0; return;
153 emit(ESC); emit('('); state = state0; goto again;
155 case state4: /* two part char */
156 if(c < 0){
157 if(squawk)
158 EPR "%s: unexpected EOF in %s\n", argv0, file);
159 c = 0x21 | (lastc&0x80);
161 if(CANS2J(lastc, c)){ /* ms dos sjis */
162 int hi = lastc, lo = c;
163 S2J(hi, lo); /* convert to 208 */
164 n = hi*100 + lo - 3232; /* convert to kuten208 */
165 } else {
166 nerrors++;
167 if(squawk)
168 EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
169 if(!clean)
170 emit(BADMAP);
171 state = state0;
172 goto again;
174 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
175 nerrors++;
176 if(squawk)
177 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
178 if(!clean)
179 emit(BADMAP);
180 } else {
181 if(l < 0){
182 l = -l;
183 if(squawk)
184 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
186 emit(l);
188 state = state0;
192 /*
193 a state machine for interpreting ujis == EUC
194 */
195 static void
196 ujis(int c, Rune **r, long input_loc)
198 static enum { state0, state1 } state = state0;
199 static int lastc;
200 int n;
201 long l;
203 switch(state)
205 case state0: /* idle state */
206 if(c < 0) return;
207 if(c < 128){
208 emit(c);
209 return;
211 if(c == 0x8e){ /* codeset 2 */
212 nerrors++;
213 if(squawk)
214 EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
215 if(!clean)
216 emit(BADMAP);
217 return;
219 if(c == 0x8f){ /* codeset 3 */
220 nerrors++;
221 if(squawk)
222 EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
223 if(!clean)
224 emit(BADMAP);
225 return;
227 lastc = c;
228 state = state1;
229 return;
231 case state1: /* two part char */
232 if(c < 0){
233 if(squawk)
234 EPR "%s: unexpected EOF in %s\n", argv0, file);
235 c = 0xA1;
237 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */
238 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
239 nerrors++;
240 if(squawk)
241 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
242 if(!clean)
243 emit(BADMAP);
244 } else {
245 if(l < 0){
246 l = -l;
247 if(squawk)
248 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
250 emit(l);
252 state = state0;
256 /*
257 a state machine for interpreting jis-kanji == 2022-JP
258 */
259 static void
260 jis(int c, Rune **r, long input_loc)
262 static enum { state0, state1, state2, state3, state4 } state = state0;
263 static int set8 = 0;
264 static int japan646 = 0;
265 static int lastc;
266 int n;
267 long l;
269 again:
270 switch(state)
272 case state0: /* idle state */
273 if(c == ESC){ state = state1; return; }
274 if(c < 0) return;
275 if(!set8 && (c < 128)){
276 if(japan646){
277 switch(c)
279 case '\\': emit(0xA5); return; /* yen */
280 case '~': emit(0xAF); return; /* spacing macron */
281 default: emit(c); return;
283 } else {
284 emit(c);
285 return;
288 lastc = c; state = state4; return;
290 case state1: /* seen an escape */
291 if(c == '$'){ state = state2; return; }
292 if(c == '('){ state = state3; return; }
293 emit(ESC); state = state0; goto again;
295 case state2: /* may be shifting into JIS */
296 if((c == '@') || (c == 'B')){
297 set8 = 1; state = state0; return;
299 emit(ESC); emit('$'); state = state0; goto again;
301 case state3: /* may be shifting out of JIS */
302 if((c == 'J') || (c == 'H') || (c == 'B')){
303 japan646 = (c == 'J');
304 set8 = 0; state = state0; return;
306 emit(ESC); emit('('); state = state0; goto again;
308 case state4: /* two part char */
309 if(c < 0){
310 if(squawk)
311 EPR "%s: unexpected EOF in %s\n", argv0, file);
312 c = 0x21 | (lastc&0x80);
314 if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
315 emit(lastc);
316 state = state0;
317 goto again;
319 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */
320 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
321 nerrors++;
322 if(squawk)
323 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
324 if(!clean)
325 emit(BADMAP);
326 } else {
327 if(l < 0){
328 l = -l;
329 if(squawk)
330 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
332 emit(l);
334 state = state0;
338 static void
339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
341 Rune ob[N];
342 Rune *r, *re;
343 uchar ibuf[N];
344 int n, i;
345 long nin;
347 r = ob;
348 re = ob+N-3;
349 nin = 0;
350 while((n = read(fd, ibuf, sizeof ibuf)) > 0){
351 for(i = 0; i < n; i++){
352 (*procfn)(ibuf[i], &r, nin++);
353 if(r >= re){
354 OUT(out, ob, r-ob);
355 r = ob;
358 if(r > ob){
359 OUT(out, ob, r-ob);
360 r = ob;
363 (*procfn)(-1, &r, nin);
364 if(r > ob)
365 OUT(out, ob, r-ob);
366 OUT(out, ob, 0);
369 void
370 jis_in(int fd, long *notused, struct convert *out)
372 USED(notused);
373 do_in(fd, alljis, out);
376 void
377 ujis_in(int fd, long *notused, struct convert *out)
379 USED(notused);
380 do_in(fd, ujis, out);
383 void
384 msjis_in(int fd, long *notused, struct convert *out)
386 USED(notused);
387 do_in(fd, ms, out);
390 void
391 jisjis_in(int fd, long *notused, struct convert *out)
393 USED(notused);
394 do_in(fd, jis, out);
397 static int first = 1;
399 static void
400 tab_init(void)
402 int i;
403 long l;
405 first = 0;
406 for(i = 0; i < NRUNE; i++)
407 tab[i] = -1;
408 for(i = 0; i < KUTEN208MAX; i++)
409 if((l = tabkuten208[i]) != -1){
410 if(l < 0)
411 tab[-l] = i;
412 else
413 tab[l] = i;
418 /* jis-kanji, or ISO 2022-JP */
419 void
420 jisjis_out(Rune *base, int n, long *notused)
422 char *p;
423 int i;
424 Rune r;
425 static enum { ascii, japan646, jp2022 } state = ascii;
427 USED(notused);
428 if(first)
429 tab_init();
430 nrunes += n;
431 p = obuf;
432 for(i = 0; i < n; i++){
433 r = base[i];
434 if(r < 128){
435 if(state == jp2022){
436 *p++ = ESC; *p++ = '('; *p++ = 'B';
437 state = ascii;
439 *p++ = r;
440 } else {
441 if(tab[r] != -1){
442 if(state != jp2022){
443 *p++ = ESC; *p++ = '$'; *p++ = 'B';
444 state = jp2022;
446 *p++ = tab[r]/100 + ' ';
447 *p++ = tab[r]%100 + ' ';
448 continue;
450 if(squawk)
451 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
452 nerrors++;
453 if(clean)
454 continue;
455 *p++ = BYTEBADMAP;
458 noutput += p-obuf;
459 if(p > obuf)
460 write(1, obuf, p-obuf);
463 /* ms-kanji, or Shift-JIS */
464 void
465 msjis_out(Rune *base, int n, long *notused)
467 char *p;
468 int i, hi, lo;
469 Rune r;
471 USED(notused);
472 if(first)
473 tab_init();
474 nrunes += n;
475 p = obuf;
476 for(i = 0; i < n; i++){
477 r = base[i];
478 if(r < 128)
479 *p++ = r;
480 else {
481 if(tab[r] != -1){
482 hi = tab[r]/100 + ' ';
483 lo = tab[r]%100 + ' ';
484 J2S(hi, lo);
485 *p++ = hi;
486 *p++ = lo;
487 continue;
489 if(squawk)
490 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
491 nerrors++;
492 if(clean)
493 continue;
494 *p++ = BYTEBADMAP;
497 noutput += p-obuf;
498 if(p > obuf)
499 write(1, obuf, p-obuf);
502 /* ujis, or EUC */
503 void
504 ujis_out(Rune *base, int n, long *notused)
506 char *p;
507 int i;
508 Rune r;
510 USED(notused);
511 if(first)
512 tab_init();
513 nrunes += n;
514 p = obuf;
515 for(i = 0; i < n; i++){
516 r = base[i];
517 if(r < 128)
518 *p++ = r;
519 else {
520 if(tab[r] != -1){
521 *p++ = 0x80 | (tab[r]/100 + ' ');
522 *p++ = 0x80 | (tab[r]%100 + ' ');
523 continue;
525 if(squawk)
526 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
527 nerrors++;
528 if(clean)
529 continue;
530 *p++ = BYTEBADMAP;
533 noutput += p-obuf;
534 if(p > obuf)
535 write(1, obuf, p-obuf);