commit - 01a1c31a7d99fd24ba134ddc33fb3df95a668f3a
commit + 431e32de9b41c230b0791fb9f2f293859d189e59
blob - 81391f96923142bba020e315db4f5ef56e60d2b8
blob + cc368ac2c5e367c1de93b60c0e00a807ea8581fc
--- src/libhtml/lex.c
+++ src/libhtml/lex.c
{"kappa", 954},
{"lambda", 955},
{"laquo", 171},
+ {"ldquo", 8220},
{"ldots", 8230},
+ {"lsquo", 8216},
{"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"quad", 8193},
{"quot", 34},
{"raquo", 187},
+ {"rdquo", 8221},
{"reg", 174},
{"rho", 961},
+ {"rsquo", 8217},
{"sect", 167},
{"shy", 173},
{"sigma", 963},
ai = 0;
if(dbglex)
fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
- if(ts->mtype == TextHtml) {
- for(;;) {
- if(ai == alen) {
+ if(ts->mtype == TextHtml){
+ for(;;){
+ if(ai == alen){
a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
alen += ToksChunk;
}
c = getchar(ts);
if(c < 0)
break;
- if(c == '<') {
+ if(c == '<'){
tag = gettag(ts, starti, a, &ai);
- if(tag == Tscript) {
+ if(tag == Tscript){
// special rules for getting Data after....
starti = ts->i;
c = getchar(ts);
}
else {
// plain text (non-html) tokens
- for(;;) {
- if(ai == alen) {
+ for(;;){
+ if(ai == alen){
a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
alen += ToksChunk;
}
s = nil;
j = 0;
starti = ts->i;
- for(c = getchar(ts); c >= 0; c = getchar(ts)) {
- if(c < ' ') {
- if(isspace(c)) {
- if(c == '\r') {
+ for(c = getchar(ts); c >= 0; c = getchar(ts)){
+ if(c < ' '){
+ if(isspace(c)){
+ if(c == '\r'){
// ignore it unless no following '\n',
// in which case treat it like '\n'
c = getchar(ts);
- if(c != '\n') {
+ if(c != '\n'){
if(c >= 0)
ungetchar(ts, c);
c = '\n';
else
c = 0;
}
- if(c != 0) {
+ if(c != 0){
buf[j++] = c;
- if(j == sizeof(buf)-1) {
+ if(j == sizeof(buf)-1){
s = buftostr(s, buf, j);
j = 0;
}
s = nil;
j = 0;
c = firstc;
- while(c >= 0) {
- if(c == '&') {
+ while(c >= 0){
+ if(c == '&'){
c = ampersand(ts);
if(c < 0)
break;
}
- else if(c < ' ') {
- if(isspace(c)) {
- if(c == '\r') {
+ else if(c < ' '){
+ if(isspace(c)){
+ if(c == '\r'){
// ignore it unless no following '\n',
// in which case treat it like '\n'
c = getchar(ts);
- if(c != '\n') {
+ if(c != '\n'){
if(c >= 0)
ungetchar(ts, c);
c = '\n';
c = 0;
}
}
- else if(c == '<') {
+ else if(c == '<'){
ungetchar(ts, c);
break;
}
- if(c != 0) {
+ if(c != 0){
buf[j++] = c;
- if(j == BIGBUFSIZE-1) {
+ if(j == BIGBUFSIZE-1){
s = buftostr(s, buf, j);
j = 0;
}
tstarti = starti;
c = firstc;
done = 0;
- while(c >= 0) {
- if(c == '<') {
+ while(c >= 0){
+ if(c == '<'){
// other browsers ignore stuff to end of line after <!
savei = ts->i;
c = getchar(ts);
- if(c == '!') {
+ if(c == '!'){
while(c >= 0 && c != '\n' && c != '\r')
c = getchar(ts);
if(c == '\r')
if(c == '\n')
c = getchar(ts);
}
- else if(c >= 0) {
+ else if(c >= 0){
backup(ts, savei);
tag = gettag(ts, tstarti, a, pai);
if(tag == -1)
if(tag != Comment)
(*pai)--;
backup(ts, tstarti);
- if(tag == Tscript + RBRA) {
+ if(tag == Tscript + RBRA){
done = 1;
break;
}
}
if(c < 0)
break;
- if(c != 0) {
+ if(c != 0){
buf[j++] = c;
- if(j == BIGBUFSIZE-1) {
+ if(j == BIGBUFSIZE-1){
s = buftostr(s, buf, j);
j = 0;
}
tstarti = ts->i;
c = getchar(ts);
}
- if(done || ts->i == ts->edata) {
+ if(done || ts->i == ts->edata){
s = buftostr(s, buf, j);
tok = &a[(*pai)++];
tok->tag = Data;
tok->attr = nil;
tok->starti = starti;
c = getchar(ts);
- if(c == '/') {
+ if(c == '/'){
rbra = RBRA;
c = getchar(ts);
}
if(c < 0)
goto eob_done;
- if(c >= 256 || !isalpha(c)) {
+ if(c >= 256 || !isalpha(c)){
// not a tag
- if(c == '!') {
+ if(c == '!'){
ans = comment(ts);
if(ans != -1)
return ans;
// c starts a tagname
buf[0] = c;
i = 1;
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
goto eob_done;
// attribute gathering loop
al = nil;
- while(1) {
+ for(;;){
// look for "ws name" or "ws name ws = ws val" (ws=whitespace)
// skip whitespace
attrloop_continue:
- while(c < 256 && isspace(c)) {
+ while(c < 256 && isspace(c)){
c = getchar(ts);
if(c < 0)
goto eob_done;
}
if(c == '>')
goto attrloop_done;
- if(c == '<') {
+ if(c == '<'){
if(warn)
fprint(2, "warning: unclosed tag\n");
ungetchar(ts, c);
goto attrloop_done;
}
- if(c >= 256 || !isalpha(c)) {
+ if(c >= 256 || !isalpha(c)){
if(warn)
fprint(2, "warning: expected attribute name\n");
// skipt to next attribute name
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
goto eob_done;
if(c < 256 && isalpha(c))
goto attrloop_continue;
- if(c == '<') {
+ if(c == '<'){
if(warn)
fprint(2, "warning: unclosed tag\n");
ungetchar(ts, 60);
// gather attribute name
buf[0] = c;
i = 1;
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
goto eob_done;
buf[i++] = c;
}
afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
- if(warn && !afnd) {
+ if(warn && !afnd){
buf[i] = 0;
fprint(2, "warning: unknown attribute name %S\n", buf);
}
// skip whitespace
- while(c < 256 && isspace(c)) {
+ while(c < 256 && isspace(c)){
c = getchar(ts);
if(c < 0)
goto eob_done;
}
- if(c != '=') {
+ if(c != '='){
if(afnd)
al = newattr(attid, nil, al);
goto attrloop_continue;
}
//# c is '=' here; skip whitespace
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
goto eob_done;
break;
}
quote = 0;
- if(c == '\'' || c == '"') {
+ if(c == '\'' || c == '"'){
quote = c;
c = getchar(ts);
if(c < 0)
}
val = nil;
nv = 0;
- while(1) {
+ for(;;){
valloop_continue:
if(c < 0)
goto eob_done;
- if(c == '>') {
- if(quote) {
+ if(c == '>'){
+ if(quote){
// c might be part of string (though not good style)
// but if line ends before close quote, assume
// there was an unmatched quote
ti = ts->i;
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
goto eob_done;
- if(c == quote) {
+ if(c == quote){
backup(ts, ti);
buf[nv++] = '>';
- if(nv == BIGBUFSIZE-1) {
+ if(nv == BIGBUFSIZE-1){
val = buftostr(val, buf, nv);
nv = 0;
}
c = getchar(ts);
goto valloop_continue;
}
- if(c == '\n') {
+ if(c == '\n'){
if(warn)
fprint(2, "warning: apparent unmatched quote\n");
backup(ts, ti);
else
goto valloop_done;
}
- if(quote) {
- if(c == quote) {
+ if(quote){
+ if(c == quote){
c = getchar(ts);
if(c < 0)
goto eob_done;
goto valloop_done;
}
- if(c == '\r') {
+ if(c == '\r'){
c = getchar(ts);
goto valloop_continue;
}
if(c < 256 && isspace(c))
goto valloop_done;
}
- if(c == '&') {
+ if(c == '&'){
c = ampersand(ts);
if(c == -1)
goto eob_done;
}
buf[nv++] = c;
- if(nv == BIGBUFSIZE-1) {
+ if(nv == BIGBUFSIZE-1){
val = buftostr(val, buf, nv);
nv = 0;
}
c = getchar(ts);
}
valloop_done:
- if(afnd) {
+ if(afnd){
val = buftostr(val, buf, nv);
al = newattr(attid, val, al);
}
nexti = ts->i;
havecomment = 0;
c = getchar(ts);
- if(c == '-') {
+ if(c == '-'){
c = getchar(ts);
- if(c == '-') {
+ if(c == '-'){
if(findstr(ts, L(Larrow)))
havecomment = 1;
else
backup(ts, nexti);
}
}
- if(!havecomment) {
+ if(!havecomment){
if(c == '>')
havecomment = 1;
- else if(c >= 0) {
+ else if(c >= 0){
if(findstr(ts, L(Lgt)))
havecomment = 1;
}
c0 = s[0];
n = runestrlen(s);
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
break;
- if(c == c0) {
+ if(c == c0){
if(n == 1)
return 1;
nexti = ts->i;
- for(i = 1; i < n; i++) {
+ for(i = 1; i < n; i++){
c = getchar(ts);
if(c < 0)
goto mainloop_done;
return 0;
}
+static int
+xdigit(int c)	// value of hex digit c (0-15), or -1 if c is not a hex digit
+{
+ if('0' <= c && c <= '9')
+ return c-'0';
+ if('a' <= c && c <= 'f')	// lowercase a-f map to 10-15
+ return c-'a'+10;
+ if('A' <= c && c <= 'F')	// uppercase A-F map to 10-15 as well
+ return c-'A'+10;
+ return -1;	// anything else, including a negative c, is not a hex digit
+}
+
// We've just read an '&'; look for an entity reference
// name, and if found, return translated char.
// if there is a complete entity name but it isn't known,
c = getchar(ts);
fnd = 0;
ans = -1;
- if(c == '#') {
+ if(c == '#'){
c = getchar(ts);
v = 0;
- while(c >= 0) {
- if(!(c < 256 && isdigit(c)))
- break;
- v = v*10 + c - 48;
+ if(c == 'x'){
c = getchar(ts);
+ while((i=xdigit(c)) != -1){
+ v = v*16 + i;
+ c = getchar(ts);
+ }
+ }else{
+ while('0' <= c && c <= '9'){
+ v = v*10 + c - '0';
+ c = getchar(ts);
+ }
}
- if(c >= 0) {
+ if(c >= 0){
if(!(c == ';' || c == '\n' || c == '\r'))
ungetchar(ts, c);
c = v;
if(c == 160)
c = 160;
- if(c >= Winstart && c <= Winend) {
+ if(c >= Winstart && c <= Winend){
c = winchars[c - Winstart];
}
ans = c;
fnd = 1;
}
}
- else if(c < 256 && isalpha(c)) {
+ else if(c < 256 && isalpha(c)){
buf[0] = c;
k = 1;
- while(1) {
+ for(;;){
c = getchar(ts);
if(c < 0)
break;
- if(ISNAMCHAR(c)) {
+ if(ISNAMCHAR(c)){
if(k < SMALLBUFSIZE-1)
buf[k++] = c;
}
break;
}
}
- if(c >= 0) {
+ if(c >= 0){
fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
- if(!fnd) {
+ if(!fnd){
// Try prefixes of s
if(c == ';' || c == '\n' || c == '\r')
ungetchar(ts, c);
i = k;
- while(--k > 0) {
+ while(--k > 0){
fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
- if(fnd) {
- while(i > k) {
+ if(fnd){
+ while(i > k){
i--;
ungetchar(ts, buf[i]);
}
}
}
}
- if(!fnd) {
+ if(!fnd){
backup(ts, savei);
ans = '&';
}
return -1;
buf = ts->data;
c = buf[ts->i];
- switch(ts->chset) {
+ switch(ts->chset){
case ISO_8859_1:
if(c >= Winstart && c <= Winend)
c = winchars[c - Winstart];
ts->i++;
break;
case US_Ascii:
- if(c > 127) {
+ if(c > 127){
if(warn)
fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
}
case UTF_8:
ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
n = chartorune(&r, (char*)(buf+ts->i));
- if(ok) {
+ if(ok){
if(warn && c == 0x80)
fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
ts->i += n;
}
break;
case Unicode:
- if(ts->i < ts->edata - 1) {
+ if(ts->i < ts->edata - 1){
//standards say most-significant byte first
c = (c << 8)|(buf[ts->i + 1]);
ts->i += 2;
char a[UTFmax];
n = 1;
- switch(ts->chset) {
+ switch(ts->chset){
case UTF_8:
- if(c >= 128) {
+ if(c >= 128){
r = c;
n = runetochar(a, &r);
}
Attr* attr;
attr = t->attr;
- while(attr != nil) {
- if(attr->attid == attid) {
+ while(attr != nil){
+ if(attr->attid == attid){
if(pans != nil)
*pans = attr->value;
if(xfer)
if(dbglex > 1)
i = snprint(buf, sizeof(buf), "[%d]", t->starti);
tag = t->tag;
- if(tag == Data) {
+ if(tag == Data){
i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
}
else {
srbra = "";
- if(tag >= RBRA) {
+ if(tag >= RBRA){
tag -= RBRA;
srbra = "/";
}
if(tag == Notfound)
tname = L(Lquestion);
i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
- for(a = t->attr; a != nil; a = a->next) {
+ for(a = t->attr; a != nil; a = a->next){
aname = attrnames[a->attid];
i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
if(a->value != nil)
Attr* nexta;
a = ahead;
- while(a != nil) {
+ while(a != nil){
nexta = a->next;
free(a->value);
free(a);
if(tarray == nil)
return;
- for(i = 0; i < n; i++) {
+ for(i = 0; i < n; i++){
t = &tarray[i];
free(t->text);
freeattrs(t->attr);