UnHTML (Announce)
> I found that the unhtml problem was the result of a buffer overflow.
Nice find. That also led me to the newline problem: it never recognizes </pre> because the newline is taken literally, so the tag name ends up as "pre\n". Here's my complete diff.
--- ../unhtml/unhtml.c 1996-02-18 07:06:06 +1000
+++ unhtml.c 2024-07-16 12:43:58 +1000
@@ -20,8 +20,8 @@
typedef struct {
char in[7];
- char out1d; /* DOS character (USA codepage) */
- char out1w; /* Windows character */
+ unsigned char out1d; /* DOS character (USA codepage) */
+ unsigned char out1w; /* Windows character */
char out2[4]; /* ASCII substitute */
char use2; /* 1- use out2 instead of out1d for dos2flag
2- diacritical marked character
@@ -58,7 +58,8 @@
{"#167", 21, 167, "%"},
{"uml", '"', 168, "\""},
{"#168", '"', 168, "\""},
- {"cright", 'C', 169, "(C)",1},
+ {"COPY", 'C', 169, "(C)",1},
+ {"copy", 'C', 169, "(C)",1},
{"#169", 'C', 169, "(C)",1},
{"ordf", 166, 170, "a"},
{"#170", 166, 170, "a"},
@@ -173,6 +174,8 @@
{{0},0,0,{0}}
};
+/* the longest name above */
+#define MAX_SUB 6
void newline(void) {
@@ -208,13 +211,24 @@
}
void mygetchar(void) {
+ int space = 0;
for (;;) {
ch = getchar();
- if (ch == '\n' && !quoting) ch = ' '; /* convert to whitespace */
if (ch == EOF) {
cnewline();
exit(0);
}
+ if (!quoting) {
+ if (ch == '\n' || ch == '\t') ch = ' '; /* convert to whitespace */
+ if (ch == ' ') {
+ space = 1; /* consolidate multiple spaces */
+ continue;
+ }
+ if (space) {
+ ungetc(ch, stdin);
+ ch = ' ';
+ }
+ }
return;
}
}
@@ -253,7 +267,8 @@
void main(int argc, char **argv) {
int notflag=0, intitle=0;
- char cmdbuf[20];
+ #define CMDBUF_SIZE 32
+ char cmdbuf[CMDBUF_SIZE];
int listlevel = -1; /* not in a list */
int listcount[10]; /* current counter value at each list level */
int i;
@@ -296,30 +311,37 @@
/* special character processing */
mygetchar();
i=0;
- while (ch != ';' && i < 12) {
+ while (ch != ';' && !isspace(ch) && i < CMDBUF_SIZE - 1) {
cmdbuf[i++] = ch;
mygetchar();
}
+ if (intitle) continue;
cmdbuf[i] = 0;
- if (i > 10) {
- /* bad &; field, should not occur, but I've seen them! */
- if (!intitle) {
- printf("&%s%c", cmdbuf, ch);
+ if (*cmdbuf == '#') {
+ if (cmdbuf[1] == 'x') {
+ i = (int)strtol(cmdbuf + 2, 0, 16);
+ } else {
+ i = (int)strtol(cmdbuf + 1, 0, 10);
+ }
+ if (i < 128) {
+ putchar(i);
startline = 0;
+ continue;
}
- continue;
}
- i = 0;
- while (a[i].in) {
- if (strcmp(a[i].in,cmdbuf)==0) {
- if (!intitle) {
+ if (i <= MAX_SUB) {
+ i = 0;
+ while (*a[i].in) {
+ if (strcmp(a[i].in,cmdbuf)==0) {
putTableChar(i);
- startline = 0;
+ i = 0;
+ break;
}
- break;
+ i++;
}
- i++;
}
+ if (i) printf("&%s%c", cmdbuf, ch);
+ startline = 0;
continue;
}
/* process <> command */
@@ -330,7 +352,7 @@
mygetchar();
}
i=0;
- while (ch != ' ' && ch != '>') {
+ while (!isspace(ch) && ch != '>' && i < CMDBUF_SIZE - 1) {
cmdbuf[i++] = ch;
mygetchar();
}
@@ -391,7 +413,9 @@
}
if (strcmp("pre", cmdbuf)==0) {
/* preformatted */
- if (!notflag) cnewline();
+ cnewline();
+ newline();
+ if (notflag) skipws = 1;
quoting = !notflag;
continue;
}
@@ -534,4 +558,4 @@
continue;
}
}
-}
\ No newline at end of file
+}
I've used unsigned char simply because VC6 complains about int truncation.
Consolidate multiple spaces (outside pre) to one space.
Convert &#N; and &#xN; to a character when N is under 128.
Replace entity cright with COPY & copy.
Allow an entity to be terminated by whitespace when the closing semicolon is missing.
Preserve all entities that aren't matched.
Add additional newlines around pre.
Complete thread:
- webdump 2024-05-23 - bencollver, 27.06.2024, 00:59 (Announce)
- webdump 2024-05-23 - mbbrutman, 27.06.2024, 16:56
- UnHTML - Rugxulo, 27.06.2024, 20:01
- webdump 2024-05-23 - bencollver, 28.06.2024, 04:26
- webdump 2024-05-23 - mbbrutman, 28.06.2024, 05:39
- webdump 2024-05-23 - bencollver, 28.06.2024, 17:09
- webdump 2024-05-23 - mbbrutman, 28.06.2024, 05:39
- webdump 2024-05-23 - bocke, 30.06.2024, 00:13
- webdump 2024-05-23 - jadoxa, 30.06.2024, 03:00
- webdump 2024-05-23 - bencollver, 30.06.2024, 16:08
- webdump 2024-05-23 - mbbrutman, 27.06.2024, 16:56