--- unhtml.bak 2024-07-14 17:22:49.000000000 -0700 +++ unhtml.c 2024-07-16 07:02:48.719828785 -0700 @@ -20,8 +20,8 @@ typedef struct { char in[7]; - char out1d; /* DOS character (USA codepage) */ - char out1w; /* Windows character */ + unsigned char out1d; /* DOS character (USA codepage) */ + unsigned char out1w; /* Windows character */ char out2[4]; /* ASCII substitute */ char use2; /* 1- use out2 instead of out1d for dos2flag 2- diacritical marked character @@ -58,7 +58,8 @@ {"#167", 21, 167, "%"}, {"uml", '"', 168, "\""}, {"#168", '"', 168, "\""}, - {"cright", 'C', 169, "(C)",1}, + {"COPY", 'C', 169, "(C)",1}, + {"copy", 'C', 169, "(C)",1}, {"#169", 'C', 169, "(C)",1}, {"ordf", 166, 170, "a"}, {"#170", 166, 170, "a"}, @@ -173,6 +174,8 @@ {{0},0,0,{0}} }; +/* the longest name above */ +#define MAX_SUB 6 void newline(void) { @@ -208,13 +211,24 @@ } void mygetchar(void) { + int space = 0; for (;;) { ch = getchar(); - if (ch == '\n' && !quoting) ch = ' '; /* convert to whitespace */ if (ch == EOF) { cnewline(); exit(0); } + if (!quoting) { + if (ch == '\n' || ch == '\t') ch = ' '; /* convert to whitespace */ + if (ch == ' ') { + space = 1; /* consolidate multiple spaces */ + continue; + } + if (space) { + ungetc(ch, stdin); + ch = ' '; + } + } return; } } @@ -253,13 +267,14 @@ void main(int argc, char **argv) { int notflag=0, intitle=0; - char cmdbuf[20]; + #define CMDBUF_SIZE 32 + char cmdbuf[CMDBUF_SIZE]; int listlevel = -1; /* not in a list */ int listcount[10]; /* current counter value at each list level */ int i; char *arglist; - fprintf(stderr, "HTML removing filter Version 1.0\n" + fprintf(stderr, "HTML removing filter Version 1.0c\n" "Copyright 1996 by Tom Almy\n"); if (argc > 2) usage(); @@ -296,30 +311,37 @@ /* special character processing */ mygetchar(); i=0; - while (ch != ';' && i < 12) { + while (ch != ';' && i < CMDBUF_SIZE - 1) { cmdbuf[i++] = ch; mygetchar(); } + if (intitle) continue; cmdbuf[i] = 0; - if (i > 10) { - /* bad &; field, should not occur, but I've seen them! */ - if (!intitle) { - printf("&%s%c", cmdbuf, ch); + if (*cmdbuf == '#') { + if (cmdbuf[1] == 'x') { + i = (int)strtol(cmdbuf + 2, 0, 16); + } else { + i = (int)strtol(cmdbuf + 1, 0, 10); + } + if (i < 128) { + putchar(i); startline = 0; + continue; } - continue; } - i = 0; - while (a[i].in) { - if (strcmp(a[i].in,cmdbuf)==0) { - if (!intitle) { + if (i <= MAX_SUB) { + i = 0; + while (*a[i].in) { + if (strcmp(a[i].in,cmdbuf)==0) { putTableChar(i); - startline = 0; + i = 0; + break; } - break; + i++; } - i++; } + if (i) printf("&%s%c", cmdbuf, ch); + startline = 0; continue; } /* process <> command */ @@ -330,7 +352,7 @@ mygetchar(); } i=0; - while (ch != ' ' && ch != '>') { + while (!isspace(ch) && ch != '>' && i < CMDBUF_SIZE - 1) { cmdbuf[i++] = ch; mygetchar(); } @@ -391,7 +413,9 @@ } if (strcmp("pre", cmdbuf)==0) { /* preformatted */ - if (!notflag) cnewline(); + cnewline(); + newline(); + if (notflag) skipws = 1; quoting = !notflag; continue; }