DOS ain't dead

UnHTML (Announce)

posted by jadoxa, Queensland, Australia, 16.07.2024, 04:54

> I found that the unhtml problem was the result of a buffer overflow.

Nice find. That also led me to the newline problem - inside <pre> the newline is kept literal, so a closing tag written as "</pre" followed by a newline is never recognized: the tag name is read as "pre\n". Here's my complete diff.

--- ../unhtml/unhtml.c 1996-02-18 07:06:06 +1000 +++ unhtml.c 2024-07-16 12:43:58 +1000 @@ -20,8 +20,8 @@ typedef struct { char in[7]; - char out1d; /* DOS character (USA codepage) */ - char out1w; /* Windows character */ + unsigned char out1d; /* DOS character (USA codepage) */ + unsigned char out1w; /* Windows character */ char out2[4]; /* ASCII substitute */ char use2; /* 1- use out2 instead of out1d for dos2flag 2- diacritical marked character @@ -58,7 +58,8 @@ {"#167", 21, 167, "%"}, {"uml", '"', 168, "\""}, {"#168", '"', 168, "\""}, - {"cright", 'C', 169, "(C)",1}, + {"COPY", 'C', 169, "(C)",1}, + {"copy", 'C', 169, "(C)",1}, {"#169", 'C', 169, "(C)",1}, {"ordf", 166, 170, "a"}, {"#170", 166, 170, "a"}, @@ -173,6 +174,8 @@ {{0},0,0,{0}} }; +/* the longest name above */ +#define MAX_SUB 6 void newline(void) { @@ -208,13 +211,24 @@ } void mygetchar(void) { + int space = 0; for (;;) { ch = getchar(); - if (ch == '\n' && !quoting) ch = ' '; /* convert to whitespace */ if (ch == EOF) { cnewline(); exit(0); } + if (!quoting) { + if (ch == '\n' || ch == '\t') ch = ' '; /* convert to whitespace */ + if (ch == ' ') { + space = 1; /* consolidate multiple spaces */ + continue; + } + if (space) { + ungetc(ch, stdin); + ch = ' '; + } + } return; } } @@ -253,7 +267,8 @@ void main(int argc, char **argv) { int notflag=0, intitle=0; - char cmdbuf[20]; + #define CMDBUF_SIZE 32 + char cmdbuf[CMDBUF_SIZE]; int listlevel = -1; /* not in a list */ int listcount[10]; /* current counter value at each list level */ int i; @@ -296,30 +311,37 @@ /* special character processing */ mygetchar(); i=0; - while (ch != ';' && i < 12) { + while (ch != ';' && !isspace(ch) && i < CMDBUF_SIZE - 1) { cmdbuf[i++] = ch; mygetchar(); } + if (intitle) continue; cmdbuf[i] = 0; - if (i > 10) { - /* bad &; field, should not occur, but I've seen them! 
*/ - if (!intitle) { - printf("&%s%c", cmdbuf, ch); + if (*cmdbuf == '#') { + if (cmdbuf[1] == 'x') { + i = (int)strtol(cmdbuf + 2, 0, 16); + } else { + i = (int)strtol(cmdbuf + 1, 0, 10); + } + if (i < 128) { + putchar(i); startline = 0; + continue; } - continue; } - i = 0; - while (a[i].in) { - if (strcmp(a[i].in,cmdbuf)==0) { - if (!intitle) { + if (i <= MAX_SUB) { + i = 0; + while (*a[i].in) { + if (strcmp(a[i].in,cmdbuf)==0) { putTableChar(i); - startline = 0; + i = 0; + break; } - break; + i++; } - i++; } + if (i) printf("&%s%c", cmdbuf, ch); + startline = 0; continue; } /* process <> command */ @@ -330,7 +352,7 @@ mygetchar(); } i=0; - while (ch != ' ' && ch != '>') { + while (!isspace(ch) && ch != '>' && i < CMDBUF_SIZE - 1) { cmdbuf[i++] = ch; mygetchar(); } @@ -391,7 +413,9 @@ } if (strcmp("pre", cmdbuf)==0) { /* preformatted */ - if (!notflag) cnewline(); + cnewline(); + newline(); + if (notflag) skipws = 1; quoting = !notflag; continue; } @@ -534,4 +558,4 @@ continue; } } -} \ No newline at end of file +}

I've used unsigned char simply because VC6 complains about int truncation.
Consolidate multiple spaces (outside pre) to one space.
Convert &#N; and &#xN; to a character when N is under 128.
Replace entity cright with COPY & copy.
Allow an entity to be terminated by whitespace if the semicolon is missing.
Preserve all entities that aren't matched.
Add additional newlines around pre.

Post reply

Complete thread:

webdump 2024-05-23 - bencollver, 27.06.2024, 00:59 (Announce)
- webdump 2024-05-23 - mbbrutman, 27.06.2024, 16:56
  - UnHTML - Rugxulo, 27.06.2024, 20:01
    - UnHTML - bencollver, 28.06.2024, 04:34
      - UnHTML - jadoxa, 28.06.2024, 08:43
        UnHTML - bencollver, 30.06.2024, 16:07
        UnHTML - jadoxa, 01.07.2024, 02:05
        UnHTML - bencollver, 14.07.2024, 01:30
        UnHTML - jadoxa, 14.07.2024, 03:22
        UnHTML - bencollver, 14.07.2024, 05:16
        DJGPP 2.03p2 (June 2002) - Rugxulo, 14.07.2024, 08:11
        UnHTML - bencollver, 15.07.2024, 02:14
        UnHTML - jadoxa, 16.07.2024, 04:54
        UnHTML - bencollver, 16.07.2024, 16:12
  - webdump 2024-05-23 - bencollver, 28.06.2024, 04:26
    - webdump 2024-05-23 - mbbrutman, 28.06.2024, 05:39
      - webdump 2024-05-23 - bencollver, 28.06.2024, 17:09
- webdump 2024-05-23 - bocke, 30.06.2024, 00:13
- webdump 2024-05-23 - jadoxa, 30.06.2024, 03:00
  - webdump 2024-05-23 - bencollver, 30.06.2024, 16:08