/* ** Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM ** ========================================================================== */ #include <HTUtils.h> #include <SGML.h> #define Lynx_HTML_Handler #include <HTChunk.h> #include <HText.h> #include <HTStyle.h> #include <HTMIME.h> #include <HTML.h> #include <HTCJK.h> #include <HTAtom.h> #include <HTMLGen.h> #include <HTParse.h> #include <UCMap.h> #include <UCDefs.h> #include <UCAux.h> #include <LYGlobalDefs.h> #include <LYCharUtils.h> #include <LYCharSets.h> #include <HTAlert.h> #include <HTFont.h> #include <HTForms.h> #include <HTNestedList.h> #include <GridText.h> #include <LYStrings.h> #include <LYUtils.h> #include <LYMap.h> #include <LYBookmark.h> #include <LYCurses.h> #include <LYCookie.h> #include <LYexit.h> #include <LYLeaks.h> /* * next two used to get meta values into crawl file * - RDP */ extern char *meta_keywords; extern char *meta_description; extern BOOL HTPassEightBitRaw; extern BOOL HTPassEightBitNum; extern BOOL HTPassHighCtrlRaw; extern BOOL HTPassHighCtrlNum; extern HTkcode kanji_code; extern HTCJKlang HTCJK; /* * Used for nested lists. - FM */ PUBLIC int OL_CONTINUE = -29999; /* flag for whether CONTINUE is set */ PUBLIC int OL_VOID = -29998; /* flag for whether a count is set */ /* ** This function converts any ampersands in allocated ** strings to "&". If isTITLE is TRUE, it also ** converts any angle-brackets to "<" or ">". - FM */ PUBLIC void LYEntify ARGS2( char **, str, BOOLEAN, isTITLE) { char *p = *str; char *q = NULL, *cp = NULL; int amps = 0, lts = 0, gts = 0; if (p == NULL || *p == '\0') return; /* * Count the ampersands. - FM */ while ((*p != '\0') && (q = strchr(p, '&')) != NULL) { amps++; p = (q + 1); } /* * Count the left-angle-brackets, if needed. - FM */ if (isTITLE == TRUE) { p = *str; while ((*p != '\0') && (q = strchr(p, '<')) != NULL) { lts++; p = (q + 1); } } /* * Count the right-angle-brackets, if needed. - FM */ if (isTITLE == TRUE) { p = *str; while ((*p != '\0') && (q = strchr(p, '>')) != NULL) { gts++; p = (q + 1); } } /* * Check whether we need to convert anything. - FM */ if (amps == 0 && lts == 0 && gts == 0) return; /* * Allocate space and convert. - FM */ q = (char *)calloc(1, (strlen(*str) + (4 * amps) + (3 * lts) + (3 * gts) + 1)); if ((cp = q) == NULL) outofmem(__FILE__, "LYEntify"); for (p = *str; *p; p++) { if (*p == '&') { *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';'; } else if (isTITLE && *p == '<') { *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';'; } else if (isTITLE && *p == '>') { *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';'; } else { *q++ = *p; } } StrAllocCopy(*str, cp); FREE(cp); } /* ** This function trims characters <= that of a space (32), ** including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2), ** but not ESC, from the heads of strings. - FM */ PUBLIC void LYTrimHead ARGS1( char *, str) { int i = 0, j; if (!str || *str == '\0') return; while (str[i] != '\0' && WHITE(str[i]) && (unsigned char)str[i] != (unsigned char)CH_ESC) /* S/390 -- gil -- 1669 */ i++; if (i > 0) { for (j = 0; str[i] != '\0'; i++) { str[j++] = str[i]; } str[j] = '\0'; } } /* ** This function trims characters <= that of a space (32), ** including HT_NON_BREAK_SPACE (1), HT_EN_SPACE (2), and ** ESC from the tails of strings. - FM */ PUBLIC void LYTrimTail ARGS1( char *, str) { int i; if (!str || *str == '\0') return; i = (strlen(str) - 1); while (i >= 0) { if (WHITE(str[i])) str[i] = '\0'; else break; i--; } } /* ** This function should receive a pointer to the start ** of a comment. It returns a pointer to the end ('>') ** character of comment, or it's best guess if the comment ** is invalid. - FM */ PUBLIC char *LYFindEndOfComment ARGS1( char *, str) { char *cp, *cp1; enum comment_state { start1, start2, end1, end2 } state; if (str == NULL) /* * We got NULL, so return NULL. - FM */ return NULL; if (strncmp(str, "<!--", 4)) /* * We don't have the start of a comment, so * return the beginning of the string. - FM */ return str; cp = (str + 4); if (*cp =='>') /* * It's an invalid comment, so * return this end character. - FM */ return cp; if ((cp1 = strchr(cp, '>')) == NULL) /* * We don't have an end character, so * return the beginning of the string. - FM */ return str; if (*cp == '-') /* * Ugh, it's a "decorative" series of dashes, * so return the next end character. - FM */ return cp1; /* * OK, we're ready to start parsing. - FM */ state = start2; while (*cp != '\0') { switch (state) { case start1: if (*cp == '-') state = start2; else /* * Invalid comment, so return the first * '>' from the start of the string. - FM */ return cp1; break; case start2: if (*cp == '-') state = end1; break; case end1: if (*cp == '-') state = end2; else /* * Invalid comment, so return the first * '>' from the start of the string. - FM */ return cp1; break; case end2: if (*cp == '>') /* * Valid comment, so return the end character. - FM */ return cp; if (*cp == '-') { state = start1; } else if (!(WHITE(*cp) && (unsigned char)*cp != (unsigned char)CH_ESC)) { /* S/390 -- gil -- 1686 */ /* * Invalid comment, so return the first * '>' from the start of the string. - FM */ return cp1; } break; default: break; } cp++; } /* * Invalid comment, so return the first * '>' from the start of the string. - FM */ return cp1; } /* ** If an HREF, itself or if resolved against a base, ** represents a file URL, and the host is defaulted, ** force in "//localhost". We need this until ** all the other Lynx code which performs security ** checks based on the "localhost" string is changed ** to assume "//localhost" when a host field is not ** present in file URLs - FM */ PUBLIC void LYFillLocalFileURL ARGS2( char **, href, CONST char *, base) { char * temp = NULL; if (*href == NULL || *(*href) == '\0') return; if (!strcmp(*href, "//") || !strncmp(*href, "///", 3)) { if (base != NULL && !strncmp(base, "file:", 5)) { StrAllocCopy(temp, "file:"); StrAllocCat(temp, *href); StrAllocCopy(*href, temp); } } if (!strncmp(*href, "file:", 5)) { if (*(*href+5) == '\0') { StrAllocCat(*href, "//localhost"); } else if (!strcmp(*href, "file://")) { StrAllocCat(*href, "localhost"); } else if (!strncmp(*href, "file:///", 8)) { StrAllocCopy(temp, (*href+7)); LYLocalFileToURL (href, temp); } else if (!strncmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href+6))) { StrAllocCopy(temp, (*href+5)); LYLocalFileToURL (href, temp); } } #if defined(DOSPATH) || defined(__EMX__) if (isalpha(*(*href)) && (*(*href+1) == ':')) { /* * If it's a local DOS path beginning with drive letter, * add file://localhost/ prefix and go ahead. */ StrAllocCopy(temp, *href); LYLocalFileToURL (href, temp); } /* use below: strlen("file://localhost/") = 17 */ if (!strncmp(*href, "file://localhost/", 17) && (strlen(*href) == 19) && isalpha(*(*href+17)) && (*(*href+18) == ':')) { /* * Terminate DOS drive letter with a slash to surf root successfully. * Here seems a proper place to do so. */ StrAllocCat(*href, "/"); } #endif /* DOSPATH */ /* * No path in a file://localhost URL means a * directory listing for the current default. - FM */ if (!strcmp(*href, "file://localhost")) { char *temp2; #ifdef VMS temp2 = HTVMS_wwwName(getenv("PATH")); #else char curdir[LY_MAXPATH]; temp2 = wwwName(Current_Dir(curdir)); #endif /* VMS */ LYAddHtmlSep(href); /* * Check for pathological cases - current dir has chars which * MUST BE URL-escaped - kw */ if (strchr(temp2, '%') != NULL || strchr(temp2, '#') != NULL) { FREE(temp); temp = HTEscape(temp2, URL_PATH); StrAllocCat(*href, temp); } else { StrAllocCat(*href, temp2); } } #ifdef VMS /* * On VMS, a file://localhost/ URL means * a listing for the login directory. - FM */ if (!strcmp(*href, "file://localhost/")) StrAllocCat(*href, (HTVMS_wwwName((char *)Home_Dir())+1)); #endif /* VMS */ FREE(temp); return; } /* ** This function writes a line with a META tag to an open file, ** which will specify a charset parameter to use when the file is ** read back in. It is meant for temporary HTML files used by the ** various special pages which may show titles of documents. When those ** files are created, the title strings normally have been translated and ** expanded to the display character set, so we have to make sure they ** don't get translated again. ** If the user has changed the display character set during the lifetime ** of the Lynx session (or, more exactly, during the time the title ** strings to be written were generated), they may now have different ** character encodings and there is currently no way to get it all right. ** To change this, we would have to add a variable for each string which ** keeps track of its character encoding. ** But at least we can try to ensure that reading the file after future ** display character set changes will give reasonable output. ** ** The META tag is not written if the display character set (passed as ** disp_chndl) already corresponds to the charset assumption that ** would be made when the file is read. - KW ** ** Currently this function is used for temporary files like "Lynx Info Page" ** and for one permanent - bookmarks (so it may be a problem if you change ** the display charset later: new bookmark entries may be mistranslated). ** - LP */ PUBLIC void LYAddMETAcharsetToFD ARGS2( FILE *, fd, int, disp_chndl) { if (disp_chndl == -1) /* * -1 means use current_char_set. */ disp_chndl = current_char_set; if (fd == NULL || disp_chndl < 0) /* * Should not happen. */ return; if (UCLYhndl_HTFile_for_unspec == disp_chndl) /* * Not need to do, so we don't. */ return; if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT) /* * There shouldn't be any 8-bit characters in this case. */ return; /* * In other cases we don't know because UCLYhndl_for_unspec may * change during the lifetime of the file (by toggling raw mode * or changing the display character set), so proceed. */ fprintf(fd, "<META %s content=\"text/html;charset=%s\">\n", "http-equiv=\"content-type\"", LYCharSet_UC[disp_chndl].MIMEname); } /* ** This function returns OL TYPE="A" strings in ** the range of " A." (1) to "ZZZ." (18278). - FM */ PUBLIC char *LYUppercaseA_OL_String ARGS1( int, seqnum) { static char OLstring[8]; if (seqnum <= 1 ) { strcpy(OLstring, " A."); return OLstring; } if (seqnum < 27) { sprintf(OLstring, " %c.", (seqnum + 64)); return OLstring; } if (seqnum < 703) { sprintf(OLstring, "%c%c.", ((seqnum-1)/26 + 64), (seqnum - ((seqnum-1)/26)*26 + 64)); return OLstring; } if (seqnum < 18279) { sprintf(OLstring, "%c%c%c.", ((seqnum-27)/676 + 64), (((seqnum - ((seqnum-27)/676)*676)-1)/26 + 64), (seqnum - ((seqnum-1)/26)*26 + 64)); return OLstring; } strcpy(OLstring, "ZZZ."); return OLstring; } /* ** This function returns OL TYPE="a" strings in ** the range of " a." (1) to "zzz." (18278). - FM */ PUBLIC char *LYLowercaseA_OL_String ARGS1( int, seqnum) { static char OLstring[8]; if (seqnum <= 1 ) { strcpy(OLstring, " a."); return OLstring; } if (seqnum < 27) { sprintf(OLstring, " %c.", (seqnum + 96)); return OLstring; } if (seqnum < 703) { sprintf(OLstring, "%c%c.", ((seqnum-1)/26 + 96), (seqnum - ((seqnum-1)/26)*26 + 96)); return OLstring; } if (seqnum < 18279) { sprintf(OLstring, "%c%c%c.", ((seqnum-27)/676 + 96), (((seqnum - ((seqnum-27)/676)*676)-1)/26 + 96), (seqnum - ((seqnum-1)/26)*26 + 96)); return OLstring; } strcpy(OLstring, "zzz."); return OLstring; } /* ** This function returns OL TYPE="I" strings in the ** range of " I." (1) to "MMM." (3000).- FM */ PUBLIC char *LYUppercaseI_OL_String ARGS1( int, seqnum) { static char OLstring[8]; int Arabic = seqnum; if (Arabic >= 3000) { strcpy(OLstring, "MMM."); return OLstring; } switch(Arabic) { case 1: strcpy(OLstring, " I."); return OLstring; case 5: strcpy(OLstring, " V."); return OLstring; case 10: strcpy(OLstring, " X."); return OLstring; case 50: strcpy(OLstring, " L."); return OLstring; case 100: strcpy(OLstring, " C."); return OLstring; case 500: strcpy(OLstring, " D."); return OLstring; case 1000: strcpy(OLstring, " M."); return OLstring; default: OLstring[0] = '\0'; break; } while (Arabic >= 1000) { strcat(OLstring, "M"); Arabic -= 1000; } if (Arabic >= 900) { strcat(OLstring, "CM"); Arabic -= 900; } if (Arabic >= 500) { strcat(OLstring, "D"); Arabic -= 500; while (Arabic >= 500) { strcat(OLstring, "C"); Arabic -= 10; } } if (Arabic >= 400) { strcat(OLstring, "CD"); Arabic -= 400; } while (Arabic >= 100) { strcat(OLstring, "C"); Arabic -= 100; } if (Arabic >= 90) { strcat(OLstring, "XC"); Arabic -= 90; } if (Arabic >= 50) { strcat(OLstring, "L"); Arabic -= 50; while (Arabic >= 50) { strcat(OLstring, "X"); Arabic -= 10; } } if (Arabic >= 40) { strcat(OLstring, "XL"); Arabic -= 40; } while (Arabic > 10) { strcat(OLstring, "X"); Arabic -= 10; } switch (Arabic) { case 1: strcat(OLstring, "I."); break; case 2: strcat(OLstring, "II."); break; case 3: strcat(OLstring, "III."); break; case 4: strcat(OLstring, "IV."); break; case 5: strcat(OLstring, "V."); break; case 6: strcat(OLstring, "VI."); break; case 7: strcat(OLstring, "VII."); break; case 8: strcat(OLstring, "VIII."); break; case 9: strcat(OLstring, "IX."); break; case 10: strcat(OLstring, "X."); break; default: strcat(OLstring, "."); break; } return OLstring; } /* ** This function returns OL TYPE="i" strings in ** range of " i." (1) to "mmm." (3000).- FM */ PUBLIC char *LYLowercaseI_OL_String ARGS1( int, seqnum) { static char OLstring[8]; int Arabic = seqnum; if (Arabic >= 3000) { strcpy(OLstring, "mmm."); return OLstring; } switch(Arabic) { case 1: strcpy(OLstring, " i."); return OLstring; case 5: strcpy(OLstring, " v."); return OLstring; case 10: strcpy(OLstring, " x."); return OLstring; case 50: strcpy(OLstring, " l."); return OLstring; case 100: strcpy(OLstring, " c."); return OLstring; case 500: strcpy(OLstring, " d."); return OLstring; case 1000: strcpy(OLstring, " m."); return OLstring; default: OLstring[0] = '\0'; break; } while (Arabic >= 1000) { strcat(OLstring, "m"); Arabic -= 1000; } if (Arabic >= 900) { strcat(OLstring, "cm"); Arabic -= 900; } if (Arabic >= 500) { strcat(OLstring, "d"); Arabic -= 500; while (Arabic >= 500) { strcat(OLstring, "c"); Arabic -= 10; } } if (Arabic >= 400) { strcat(OLstring, "cd"); Arabic -= 400; } while (Arabic >= 100) { strcat(OLstring, "c"); Arabic -= 100; } if (Arabic >= 90) { strcat(OLstring, "xc"); Arabic -= 90; } if (Arabic >= 50) { strcat(OLstring, "l"); Arabic -= 50; while (Arabic >= 50) { strcat(OLstring, "x"); Arabic -= 10; } } if (Arabic >= 40) { strcat(OLstring, "xl"); Arabic -= 40; } while (Arabic > 10) { strcat(OLstring, "x"); Arabic -= 10; } switch (Arabic) { case 1: strcat(OLstring, "i."); break; case 2: strcat(OLstring, "ii."); break; case 3: strcat(OLstring, "iii."); break; case 4: strcat(OLstring, "iv."); break; case 5: strcat(OLstring, "v."); break; case 6: strcat(OLstring, "vi."); break; case 7: strcat(OLstring, "vii."); break; case 8: strcat(OLstring, "viii."); break; case 9: strcat(OLstring, "ix."); break; case 10: strcat(OLstring, "x."); break; default: strcat(OLstring, "."); break; } return OLstring; } /* ** This function initializes the Ordered List counter. - FM */ PUBLIC void LYZero_OL_Counter ARGS1( HTStructured *, me) { int i; if (!me) return; for (i = 0; i < 12; i++) { me->OL_Counter[i] = OL_VOID; me->OL_Type[i] = '1'; } me->Last_OL_Count = 0; me->Last_OL_Type = '1'; return; } /* ** This function is used by the HTML Structured object. - KW */ PUBLIC void LYGetChartransInfo ARGS1( HTStructured *, me) { me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_STRUCTURED); if (me->UCLYhndl < 0) { int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT); if (chndl < 0) { chndl = current_char_set; HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_HTEXT, UCT_SETBY_STRUCTURED); } HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_STRUCTURED, UCT_SETBY_STRUCTURED); me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_STRUCTURED); } me->UCI = HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED); } #ifdef NOTUSED_FOTEMODS /* ** This function reallocates an allocated string and converts ** characters for the current display character set. It assumes ** that invalid control characters have been dealt with by the ** SGML (or other initial) parser of the document input stream ** (i.e., are present only if elements or global flags have been ** set to allow them), and that otherwise this is a copy of the ** string with the charset of the input stream. It handles Lynx ** special characters based on the 'me' structure's element values ** (the me->UsePlainSpace and me->HiddenValue elements, and its ** chartrans related elements), and calls to other functions which ** return structure element values. HTChunk functions are used to ** keep memory allocations at a minimum. - FM */ PUBLIC void LYExpandString ARGS2( HTStructured *, me, char **, str) { char *p = *str; HTChunk *s; BOOLEAN plain_space, hidden; char c; unsigned char c_unsign; char saved_char_in = '\0'; BOOLEAN chk; UCode_t code, uck; char replace_buf [64]; char utf_buf[8], utf_count = 0; char *utf_buf_p = utf_buf; UCode_t utf_char = 0, value; CONST char *name; int i, j, high, low, diff = 0; /* ** Don't do anything if we have no structure ** or string, or are in CJK mode. - FM */ if (!me || !p || *p == '\0' || HTCJK != NOCJK) return; /* ** Set "convenience copies" of me structure ** elements. - FM */ plain_space = me->UsePlainSpace; hidden = me->HiddenValue; /* ** Check for special input charsets - FM */ if (!strcmp(me->inUCI->MIMEname, "x-transparent")) { /* ** Conversions not intended. - FM */ return; } if (!strcmp(me->inUCI->MIMEname, "mnemonic") || !strcmp(me->inUCI->MIMEname, "mnemonic+ascii+0")) { /* ** All ASCII representations of Unicode characters, ** and we have no reverse translation code for the ** multibyte characters, so punt. - FM */ return; } if (me->inUCLYhndl < 0 || me->outUCLYhndl < 0) { /* ** The chartrans procedure failed, so we don't ** do anything, and hope for the best. - FM */ CTRACE(tfp, "LYExpandString: Bad in (%d) or out (%d) handle(s).\n", me->inUCLYhndl, me->outUCLYhndl); return; } /* ** Zero the UTF-8 multibytes buffer. - FM */ utf_buf[0] = utf_buf[6] = utf_buf[7] = '\0'; /* ** Set up an HTChunk for accumulating the expanded copy ** of the string, so that allocations are done in 128 ** byte increments, only as required. - FM */ s = HTChunkCreate(128); /* ** Check each character in the original string, ** and add the characters or substitutions to ** our clean copy. - FM */ for (i = 0; p[i]; i++) { /* ** Make sure the character is handled as Unicode ** whenever that's appropriate. - FM */ c = p[i]; c_unsign = (unsigned char)c; code = (UCode_t)c_unsign; saved_char_in = '\0'; /* ** Combine any UTF-8 multibytes into Unicode ** to check for special characters. - FM */ if (me->T.decode_utf8) { /* ** Our input charset is UTF-8, so check ** for non-ASCII characters. - FM */ if (TOASCII(c_unsign) > 127) { /* S/390 -- gil -- 1703 */ /* ** We have an octet from a multibyte character. - FM */ if (utf_count > 0 && (c & 0xc0) == 0x80) { /* ** Adjust the UCode_t value, add the octet ** to the buffer, and decrement the byte ** count. - FM */ utf_char = (utf_char << 6) | (c & 0x3f); utf_count--; *utf_buf_p = c; utf_buf_p++; if (utf_count == 0) { /* ** We have all of the bytes, so terminate ** the buffer and set 'code' to the UCode_t ** value. - FM */ *utf_buf_p = '\0'; code = utf_char; /* ** Set up the monobyte character ** values or non-character flags ** and fall through. - FM */ if (code > 0 && code < 256) { c = ((char)(code & 0xff)); c_unsign = (unsigned char)c; } } else { /* ** Get the next byte. - FM */ continue; } } else { /* ** Start handling a new multibyte character. - FM */ utf_buf[0] = c; utf_buf_p = &utf_buf[1]; if ((c & 0xe0) == 0xc0) { utf_count = 1; utf_char = (c & 0x1f); } else if ((c & 0xf0) == 0xe0) { utf_count = 2; utf_char = (c & 0x0f); } else if ((c & 0xf8) == 0xf0) { utf_count = 3; utf_char = (c & 0x07); } else if ((c & 0xfc) == 0xf8) { utf_count = 4; utf_char = (c & 0x03); } else if ((c & 0xfe) == 0xfc) { utf_count = 5; utf_char = (c & 0x01); } else { /* ** We got garbage, even though it should ** have been filtered out by the SGML or ** input stream parser, so we'll ignore ** it. - FM */ utf_count = 0; utf_buf[0] = '\0'; utf_buf_p = utf_buf; } /* ** Get the next byte. - FM */ continue; } } else if (utf_count > 0) { /* ** Got an ASCII character when expecting ** UTF-8 multibytes, so ignore the buffered ** multibyte characters and fall through with ** the current ASCII character. - FM */ utf_count = 0; utf_buf[0] = '\0'; utf_buf_p = utf_buf; code = (UCode_t)c_unsign; } else { /* ** Got a valid ASCII character, so fall ** through with it. - FM */ code = (UCode_t)c_unsign; } } /* ** Convert characters from non-UTF-8 charsets ** to Unicode (if appropriate). - FM */ if (!(me->T.decode_utf8 && (unsigned char)p[i] > 127)) { #ifdef NOTDEFINED if (me->T.strip_raw_char_in) saved_char_in = c; #endif /* NOTDEFINED */ if (me->T.trans_to_uni && (code >= LYlowest_eightbit[me->inUCLYhndl] || (code < 32 && code != 0 && me->T.trans_C0_to_uni))) { /* ** Convert the octet to Unicode. - FM */ code = (UCode_t)UCTransToUni(c, me->inUCLYhndl); if (code > 0) { saved_char_in = c; if (code < 256) { c = ((char)(code & 0xff)); c_unsign = (unsigned char)c; } } } else if (code < ' ' && code != 0 && /* S/390 -- gil -- 1720 */ me->T.trans_C0_to_uni) { /* ** Quote from SGML.c: ** "This else if may be too ugly to keep. - KW" */ if (me->T.trans_from_uni && (((code = UCTransToUni(c, me->inUCLYhndl)) >= ' ') || /* S/390 -- gil -- 1737 */ (me->T.transp && (code = UCTransToUni(c, me->inUCLYhndl)) > 0))) { saved_char_in = c; if (code < 256) { c = ((char)(code & 0xff)); c_unsign = (unsigned char)c; } } else { uck = -1; if (me->T.transp) { uck = UCTransCharStr(replace_buf, 60, c, me->inUCLYhndl, me->inUCLYhndl, NO); } if (!me->T.transp || uck < 0) { uck = UCTransCharStr(replace_buf, 60, c, me->inUCLYhndl, me->outUCLYhndl, YES); } if (uck == 0) { continue; } else if (uck < 0) { utf_buf[0] = '\0'; code = (unsigned char)c; } else { c = replace_buf[0]; if (c && replace_buf[1]) { HTChunkPuts(s, replace_buf); continue; } } utf_buf[0] = '\0'; code = (unsigned char)c; } /* Next line end of ugly stuff for C0. - KW */ } else { utf_buf[0] = '\0'; code = (unsigned char)c; } } /* ** Ignore low ISO 646 7-bit control characters ** if they sneaked through (should have been ** filtered by the parser). - FM */ if (code < ' ' && /* S/390 -- gil -- 1754 */ c != 9 && c != 10 && c != 13) { continue; } /* ** Ignore 127 if we don't have HTPassHighCtrlRaw ** and it sneaked through (should have been ** filtered by the parser). - FM */ if (TOASCII(c) == 127 && /* S/390 -- gil -- 1771 */ !(me->T.transp || code >= LYlowest_eightbit[me->inUCLYhndl])) { continue; } /* ** Ignore 8-bit control characters 128 - 159 if we don't ** have HTPassHighCtrlRaw set and they sneaked through ** (should have been filtered by the parser). - FM */ if (TOASCII(code) > 127 && TOASCII(code) < 160 && /* S/390 -- gil -- 1788 */ !(me->T.transp || code >= LYlowest_eightbit[me->inUCLYhndl])) { continue; } /* ** For 160 (nbsp), substitute Lynx special character ** (or a space if plain_space or hidden is set) if ** HTPassHighCtrlRaw is not set. - FM */ if (code == CH_NBSP) { /* S/390 -- gil -- 1805 */ if (!me->T.pass_160_173_raw) { if (plain_space || hidden) { HTChunkPutc(s, ' '); } else { HTChunkPutc(s, HT_NON_BREAK_SPACE); } } else if (!me->T.output_utf8) { HTChunkPutc(s, ((char)(code & 0xff))); } else if (me->T.decode_utf8 && *utf_buf) { HTChunkPuts(s, utf_buf); utf_buf[0] == '\0'; utf_buf_p = utf_buf; } else { HTChunkPutUtf8Char(s, code); } continue; } /* ** For 173 (shy), substitute Lynx special character ** (or skip it if plain_space or hidden is set) if ** HTPassHighCtrlRaw is not set. - FM */ if (code == CH_SHY) { /* S/390 -- gil -- 1822 */ if (!me->T.pass_160_173_raw) { if (!(plain_space || hidden)) { HTChunkPutc(s, LY_SOFT_HYPHEN); } } else if (!me->T.output_utf8) { HTChunkPutc(s, ((char)(code & 0xff))); } else if (me->T.decode_utf8 && *utf_buf) { HTChunkPuts(s, utf_buf); utf_buf[0] == '\0'; utf_buf_p = utf_buf; } else { HTChunkPutUtf8Char(s, code); } continue; } /* ** For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use ** an ASCII space (32) if plain_space or hidden is TRUE, ** otherwise use the Lynx special character. - FM */ if (code == 8194 || code == 8195 || code == 8201) { if (plain_space || hidden) { HTChunkPutc(s, ' '); } else { HTChunkPutc(s, HT_EN_SPACE); } if (me->T.decode_utf8 && *utf_buf) { utf_buf[0] == '\0'; utf_buf_p = utf_buf; } continue; } /* ** If we want the raw character, pass it now. - FM */ if (me->T.use_raw_char_in && saved_char_in) { HTChunkPutc(s, saved_char_in); continue; } /* ** Seek a translation from the chartrans tables. */ if ((chk = (me->T.trans_from_uni && code >= 160)) && (uck = UCTransUniChar(code, me->outUCLYhndl)) >= 32 && uck < 256 && (uck < 127 || uck >= LYlowest_eightbit[me->outUCLYhndl])) { HTChunkPutc(s, ((char)(uck & 0xff))); continue; } else if (chk && (uck == -4 || (me->T.repl_translated_C0 && uck > 0 && uck < ' ')) && /* S/390 -- gil -- 1839 */ /* ** Not found; look for replacement string. */ (uck = UCTransUniCharStr(replace_buf, 60, code, me->outUCLYhndl, 0) >= 0)) { /* ** Got a replacement string. */ HTChunkPuts(s, replace_buf); continue; } /* ** If we want raw UTF-8, output that now. - FM */ if (me->T.output_utf8 && TOASCII(code) > 127 && code <= 0x7fffffffL) { /* S/390 -- gil -- 1856 */ if (me->T.decode_utf8 && *utf_buf) { HTChunkPuts(s, utf_buf); utf_buf[0] == '\0'; utf_buf_p = utf_buf; } else { HTChunkPutUtf8Char(s, code); } continue; } /* ** If it's any other (> 160) 8-bit character ** and we have not set HTPassEightBitRaw ** nor have the "ISO Latin 1" character set selected, ** back translate for our character set. - FM */ if (code > 160 && code < 256 && me->outUCLYhndl != LATIN1 && (!(HTPassEightBitRaw || (me->T.do_8bitraw && !me->T.trans_from_uni)))) { value = (code - 160); name = HTMLGetEntityName(value); for (low = 0, high = HTML_dtd.number_of_entities; high > low; diff < 0 ? (low = j+1) : (high = j)) { /* ** Binary search. */ j = (low + (high-low)/2); diff = strcmp(HTML_dtd.entity_names[j], name); if (diff == 0) { HTChunkPuts(s, LYCharSets[me->outUCLYhndl][j]); break; } } if (diff == 0) { continue; } } /* ** If it's ASCII at this point, use it. - FM */ if (TOASCII(code) < 127 && code > 0) { /* S/390 -- gil -- 1873 */ HTChunkPutc(s, ((char)(code & 0xff))); continue; } /* ** At this point, if we should have translated, the ** translation has failed. We should have sent UTF-8 ** output to the parser already, but what the heck, ** try again. - FM */ if (me->T.output_utf8 && *utf_buf) { HTChunkPuts(s, utf_buf); utf_buf[0] == '\0'; utf_buf_p = utf_buf; continue; } #ifdef NOTDEFINED /* ** Check for a strippable koi8-r 8-bit character. - FM */ if (me->T.strip_raw_char_in && (unsigned char)saved_char_in >= 192 && (unsigned char)saved_char_in < 255 && saved_char_in) { /* ** KOI8 special: strip high bit, gives (somewhat) readable ** ASCII or KOI7 - it was constructed that way! - KW */ HTChunkPutc(s, (saved_char_in & 0x7f)); continue; } #endif /* NOTDEFINED */ /* ** Ignore 8204 (zwnj), 8205 (zwj) ** 8206 (lrm), and 8207 (rlm), ** if we get to here. - FM */ if (code == 8204 || code == 8205 || code == 8206 || code == 8207) { CTRACE(tfp, "LYExpandString: Ignoring '%ld'.\n", code); if (me->T.decode_utf8 && *utf_buf) { utf_buf[0] == '\0'; utf_buf_p = utf_buf; } continue; } /* ** If we don't actually want the character, ** make it safe and output that now. - FM */ if ((c_unsign > 0 && c_unsign < LYlowest_eightbit[me->outUCLYhndl]) || (me->T.trans_from_uni && !HTPassEightBitRaw)) { /* ** If we do not have the "7-bit approximations" as our ** output character set (in which case we did it already) ** seek a translation for that. Otherwise, or if the ** translation fails, use UHHH notation. - FM */ if ((chk = (me->outUCLYhndl != UCGetLYhndl_byMIME("us-ascii"))) && (uck = UCTransUniChar(code, UCGetLYhndl_byMIME("us-ascii"))) >= ' ' && TOASCII(uck) < 127) { /* S/390 -- gil -- 1890 */ /* ** Got an ASCII character (yippey). - FM */ c = ((char)(uck & 0xff)); HTChunkPutc(s, c); continue; } else if ((uck == -4) && (uck = UCTransUniCharStr(replace_buf, 60, code, UCGetLYhndl_byMIME("us-ascii"), 0) >= 0)) { /* ** Got a replacement string (yippey). - FM */ HTChunkPuts(s, replace_buf); continue; } else { /* ** Out of luck, so use the UHHH notation (ugh). - FM */ sprintf(replace_buf, "U%.2lX", TOASCII(code)); /* S/390 -- gil -- 1907 */ HTChunkPuts(s, replace_buf); continue; } } /* ** If we get to here and have a monobyte character, ** pass it. - FM */ if (c_unsign > 0 && c_unsign < 256) { HTChunkPutc(s, c); } } /* ** Terminate the expanded string, ** replace the original, and free ** the chunk. - FM */ HTChunkTerminate(s); StrAllocCopy(*str, s->data); HTChunkFree(s); } #endif /* NOTUSED_FOTEMODS */ /* ** Get UCS character code for one character from UTF-8 encoded string. ** ** On entry: ** *ppuni should point to beginning of UTF-8 encoding character ** On exit: ** *ppuni is advanced to point to the last byte of UTF-8 sequence, ** if there was a valid one; otherwise unchanged. ** returns the UCS value ** returns negative value on error (invalid UTF-8 sequence) */ PRIVATE UCode_t UCGetUniFromUtf8String ARGS1(char **, ppuni) { UCode_t uc_out = 0; char * p = *ppuni; int utf_count, i; if (!(**ppuni&0x80)) return (UCode_t) **ppuni; /* ASCII range character */ else if (!(**ppuni&0x40)) return (-1); /* not a valid UTF-8 start */ if ((*p & 0xe0) == 0xc0) { utf_count = 1; } else if ((*p & 0xf0) == 0xe0) { utf_count = 2; } else if ((*p & 0xf8) == 0xf0) { utf_count = 3; } else if ((*p & 0xfc) == 0xf8) { utf_count = 4; } else if ((*p & 0xfe) == 0xfc) { utf_count = 5; } else { /* garbage */ return (-1); } for (p = *ppuni, i = 0; i < utf_count ; i++) { if ((*(++p) & 0xc0) != 0x80) return (-1); } p = *ppuni; switch (utf_count) { case 1: uc_out = (((*p&0x1f) << 6) | (*(p+1)&0x3f)); break; case 2: uc_out = (((((*p&0x0f) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)); break; case 3: uc_out = (((((((*p&0x07) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6) | (*(p+3)&0x3f)); break; case 4: uc_out = (((((((((*p&0x03) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6) | (*(p+3)&0x3f)) << 6) | (*(p+4)&0x3f)); break; case 5: uc_out = (((((((((((*p&0x01) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6) | (*(p+3)&0x3f)) << 6) | (*(p+4)&0x3f)) << 6) | (*(p+5)&0x3f)); break; } *ppuni = p + utf_count; return uc_out; } /* * Given an UCS character code, will fill buffer passed in as q with * the code's UTF-8 encoding. * If terminate = YES, terminates string on success and returns pointer * to beginning. * If terminate = NO, does not terminate string, and returns pointer * next char after the UTF-8 put into buffer. * On failure, including invalid code or 7-bit code, returns NULL. */ PRIVATE char * UCPutUtf8ToBuffer ARGS3(char *, q, UCode_t, code, BOOL, terminate) { char *q_in = q; if (!q) return NULL; if (code > 127 && code < 0x7fffffffL) { if (code < 0x800L) { *q++ = (char)(0xc0 | (code>>6)); *q++ = (char)(0x80 | (0x3f & (code))); } else if (code < 0x10000L) { *q++ = (char)(0xe0 | (code>>12)); *q++ = (char)(0x80 | (0x3f & (code>>6))); *q++ = (char)(0x80 | (0x3f & (code))); } else if (code < 0x200000L) { *q++ = (char)(0xf0 | (code>>18)); *q++ = (char)(0x80 | (0x3f & (code>>12))); *q++ = (char)(0x80 | (0x3f & (code>>6))); *q++ = (char)(0x80 | (0x3f & (code))); } else if (code < 0x4000000L) { *q++ = (char)(0xf8 | (code>>24)); *q++ = (char)(0x80 | (0x3f & (code>>18))); *q++ = (char)(0x80 | (0x3f & (code>>12))); *q++ = (char)(0x80 | (0x3f & (code>>6))); *q++ = (char)(0x80 | (0x3f & (code))); } else { *q++ = (char)(0xfc | (code>>30)); *q++ = (char)(0x80 | (0x3f & (code>>24))); *q++ = (char)(0x80 | (0x3f & (code>>18))); *q++ = (char)(0x80 | (0x3f & (code>>12))); *q++ = (char)(0x80 | (0x3f & (code>>6))); *q++ = (char)(0x80 | (0x3f & (code))); } } else { return NULL; } if (terminate) { *q = '\0'; return q_in; } else { return q; } } /* as in HTParse.c, saves some calls - kw */ PRIVATE CONST char *hex = "0123456789ABCDEF"; /* * Any raw 8-bit or multibyte characters already have been * handled in relation to the display character set * in SGML_character(), including named and numeric entities. * ** This function used for translations HTML special fields inside tags ** (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'. ** It also unescapes non-ASCII characters from URL (#fragments !) ** if st_URL is active. ** ** If `do_ent' is YES, it converts named entities ** and numeric character references (NCRs) to their `cs_to' replacements. ** ** Named entities converted to unicodes. NCRs (unicodes) converted ** by UCdomap.c chartrans functions. ** ???NCRs with values in the ISO-8859-1 range 160-255 may be converted ** to their HTML entity names (via old-style entities) and then translated ** according to the LYCharSets.c array for `cs_out'???. ** ** Some characters (see descriptions in `put_special_unicodes' from SGML.c) ** translated in relation with the state of boolean variables ** `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet: ** ** If plain_space is TRUE, nbsp (160) will be treated as an ASCII ** space (32). If hidden is TRUE, entities will be translated ** (if `do_ent' is YES) but escape sequences will be passed unaltered. ** If `hidden' is FALSE, some characters are converted to Lynx special ** codes (see `put_special_unicodes') or ASCII space if `plain_space' ** applies). @@ is `use_lynx_specials' needed, does it have any effect? @@ ** If `use_lynx_specials' is YES, translate byte values 160 and 173 ** meaning U+00A0 and U+00AD given as or converted from raw char input ** are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively ** (unless input and output charset are both iso-8859-1, for compatibility ** with previous usage in HTML.c) even if `hidden' or `plain_space' is set. ** ** If `Back' is YES, the reverse is done instead i.e., Lynx special codes ** in the input are translated back to character values. ** ** If `Back' is YES, an attempt is made to use UCReverseTransChar() for ** back translation which may be more efficient. (?) ** ** If `stype' is st_URL, non-ASCII characters are URL-encoded instead. ** The sequence of bytes being URL-encoded is the raw input character if ** we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the ** UTF-8 representation if either `cs_to' requires this or if the ** character's Unicode value is > 255, otherwise it should be the iso-8859-1 ** representation. ** No general URL-encoding occurs for displayable ASCII characters and ** spaces and some C0 controls valid in HTML (LF, TAB), it is expected ** that other functions will take care of that as appropriate. ** ** Escape characters (0x1B, '\033') are ** - URL-encoded if `stype' is st_URL, otherwise ** - dropped if `stype' is st_other, otherwise (i.e., st_HTML) ** - passed if `hidden' is TRUE or HTCJK is set, otherwise ** - dropped. ** ** (If `stype' is st_URL or st_other most of the parameters really predefined: ** cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES) ** ** ** Returns pointer to the char** passed in ** if string translated or translation unnecessary, ** NULL otherwise ** (in which case something probably went wrong.) ** ** ** In general, this somehow ugly function (KW) ** cover three functions from v.2.7.2 (FM): ** extern void LYExpandString PARAMS(( ** HTStructured * me, ** char ** str)); ** extern void LYUnEscapeEntities PARAMS(( ** HTStructured * me, ** char ** str)); ** extern void LYUnEscapeToLatinOne PARAMS(( ** HTStructured * me, ** char ** str, ** BOOLEAN isURL)); */ PRIVATE char ** LYUCFullyTranslateString_1 ARGS9( char **, str, int, cs_from, int, cs_to, BOOLEAN, do_ent, BOOL, use_lynx_specials, BOOLEAN, plain_space, BOOLEAN, hidden, BOOL, Back, CharUtil_st, stype) { char * p; char *q, *qs; HTChunk *chunk = NULL; char * cp = 0; char cpe = 0; char *esc = NULL; char replace_buf [64]; int uck; int lowest_8; UCode_t code = 0; long int lcode; BOOL output_utf8 = 0, repl_translated_C0 = 0; size_t len; CONST char * name = NULL; BOOLEAN no_bytetrans; UCTransParams T; BOOL from_is_utf8 = FALSE; char * puni; enum _state { S_text, S_esc, S_dollar, S_paren, S_nonascii_text, S_dollar_paren, S_trans_byte, S_check_ent, S_ncr, S_check_uni, S_named, S_check_name, S_recover, S_got_oututf8, S_got_outstring, S_put_urlstring, S_got_outchar, S_put_urlchar, S_next_char, S_done} state = S_text; enum _parsing_what { P_text, P_utf8, P_hex, P_decimal, P_named } what = P_text; /* ** Make sure we have a non-empty string. - FM */ if (!str || *str == NULL || **str == '\0') return str; /* * FIXME: something's wrong with the limit checks here (clearing the * buffer helps). */ memset(replace_buf, 0, sizeof(replace_buf)); /* ** Don't do byte translation ** if original AND target character sets ** are both iso-8859-1, ** or if we are in CJK mode. */ no_bytetrans = ((cs_to <= 0 && cs_from == cs_to) || HTCJK != NOCJK); /* No need to translate or examine the string any further */ if (!no_bytetrans) no_bytetrans = (!use_lynx_specials && !Back && UCNeedNotTranslate(cs_from, cs_to)); /* ** Save malloc/calloc overhead in simple case - kw */ if (do_ent && hidden && (stype != st_URL) && (strchr(*str, '&') == NULL)) do_ent = FALSE; /* Can't do, caller should figure out what to do... */ if (!UCCanTranslateFromTo(cs_from, cs_to)) { if (cs_to < 0) return NULL; if (!do_ent && no_bytetrans) return NULL; no_bytetrans = TRUE; } else if (cs_to < 0) { do_ent = FALSE; } if (!do_ent && no_bytetrans) return str; p = *str; if (!no_bytetrans) { UCTransParams_clear(&T); UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from], cs_to, &LYCharSet_UC[cs_to]); from_is_utf8 = (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8); output_utf8 = T.output_utf8; repl_translated_C0 = T.repl_translated_C0; puni = p; } else if (do_ent) { output_utf8 = (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 || HText_hasUTF8OutputSet(HTMainText)); repl_translated_C0 = (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0); } lowest_8 = LYlowest_eightbit[cs_to]; /* ** Create a buffer string seven times the length of the original, ** so we have plenty of room for expansions. - FM */ len = strlen(p) + 16; q = p; qs = q; /* Create the HTChunk only if we need it */ #define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1))) #define REPLACE_STRING(s) \ if (q != qs) HTChunkPutb(CHUNK, qs, q-qs); \ HTChunkPuts(CHUNK, s); \ qs = q = *str #define REPLACE_CHAR(c) if (q > p) { \ HTChunkPutb(CHUNK, qs, q-qs); \ qs = q = *str; \ *q++ = c; \ } else \ *q++ = c /* * Loop through string, making conversions as needed. * * The while() checks for a non-'\0' char only for the normal * text states since other states may temporarily modify p or *p * (which should be restored before S_done!) - kw */ while (*p || (state != S_text && state != S_nonascii_text)) { switch(state) { case S_text: code = (unsigned char)(*p); if (*p == '\033') { if ((HTCJK != NOCJK && !hidden) || stype != st_HTML) { state = S_esc; if (stype == st_URL) { REPLACE_STRING("%1B"); p++; continue; } else if (stype != st_HTML) { p++; continue; } else { *q++ = *p++; continue; } } else if (!hidden) { /* ** CJK handling not on, and not a hidden INPUT, ** so block escape. - FM */ state = S_next_char; } else { state = S_trans_byte; } } else { state = (do_ent ? S_check_ent : S_trans_byte); } break; case S_esc: if (*p == '$') { state = S_dollar; *q++ = *p++; continue; } else if (*p == '(') { state = S_paren; *q++ = *p++; continue; } else { state = S_text; } case S_dollar: if (*p == '@' || *p == 'B' || *p == 'A') { state = S_nonascii_text; *q++ = *p++; continue; } else if (*p == '(') { state = S_dollar_paren; *q++ = *p++; continue; } else { state = S_text; } break; case S_dollar_paren: if (*p == 'C') { state = S_nonascii_text; *q++ = *p++; continue; } else { state = S_text; } break; case S_paren: if (*p == 'B' || *p == 'J' || *p == 'T') { state = S_text; *q++ = *p++; continue; } else if (*p == 'I') { state = S_nonascii_text; *q++ = *p++; continue; } else { state = S_text; } break; case S_nonascii_text: if (*p == '\033') { if ((HTCJK != NOCJK && !hidden) || stype != st_HTML) { state = S_esc; if (stype == st_URL) { REPLACE_STRING("%1B"); p++; continue; } else if (stype != st_HTML) { p++; continue; } } } *q++ = *p++; continue; case S_trans_byte: /* character translation goes here */ /* ** Don't do anything if we have no string, ** or if original AND target character sets ** are both iso-8859-1, ** or if we are in CJK mode. */ if (*p == '\0' || no_bytetrans) { state = S_got_outchar; break; } if (Back) { int rev_c; if ((*p) == HT_NON_BREAK_SPACE || (*p) == HT_EN_SPACE) { if (plain_space) { code = *p = ' '; state = S_got_outchar; break; } else { *p = 160; code = 160; if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) { state = S_got_outchar; break; } } } else if ((*p) == LY_SOFT_HYPHEN) { *p = 173; code = 173; if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) { state = S_got_outchar; break; } } else if (code < 127 || T.transp) { state = S_got_outchar; break; } rev_c = UCReverseTransChar(*p, cs_to, cs_from); if (rev_c > 127) { *p = rev_c; code = rev_c; state = S_got_outchar; break; } } else if (code < 127) { state = S_got_outchar; break; } if (from_is_utf8) { if (((*p)&0xc0)==0xc0) { puni = p; code = UCGetUniFromUtf8String(&puni); if (code <= 0) { code = (unsigned char)(*p); } else { what = P_utf8; } } } else if (use_lynx_specials && !Back && (code == 160 || code == 173) && (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) { if (code == 160) code = *p = HT_NON_BREAK_SPACE; else if (code == 173) code = *p = LY_SOFT_HYPHEN; state = S_got_outchar; break; } else if (T.trans_to_uni) { code = UCTransToUni(*p, cs_from); if (code <= 0) { /* What else can we do? */ code = (unsigned char)(*p); } #ifdef NOTUSED_FOTEMODS } else if (T.strip_raw_char_in && (unsigned char)(*p) >= 0xc0 && (unsigned char)(*p) < 255) { code = ((*p & 0x7f)); state = S_got_outchar; break; #endif /* NOTUSED_FOTEMODS */ } else if (!T.trans_from_uni) { state = S_got_outchar; break; } /* ** Substitute Lynx special character for ** 160 (nbsp) if use_lynx_specials is set. */ if (use_lynx_specials && !Back && (code == 160 || code == 173)) { code = ((code==160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN)); state = S_got_outchar; break; } state = S_check_uni; break; case S_check_ent: if (*p == '&') { char * pp = p + 1; len = strlen(pp); /* ** Check for a numeric entity. - FM */ if (*pp == '#' && len > 2 && (*(pp+1) == 'x' || *(pp+1) == 'X') && (unsigned char)*(pp+2) < 127 && isxdigit((unsigned char)*(pp+2))) { what = P_hex; state = S_ncr; } else if (*pp == '#' && len > 2 && (unsigned char)*(pp+1) < 127 && isdigit((unsigned char)*(pp+1))) { what = P_decimal; state = S_ncr; } else if ((unsigned char)*pp < 127 && isalpha((unsigned char)*pp)) { what = P_named; state = S_named; } else { state = S_trans_byte; } } else { state = S_trans_byte; } break; case S_ncr: if (what == P_hex) { p += 3; } else { /* P_decimal */ p += 2; } cp = p; while (*p && (unsigned char)*p < 127 && (what == P_hex ? isxdigit((unsigned char)*p) : isdigit((unsigned char)*p))) { p++; } /* ** Save the terminator and isolate the digit(s). - FM */ cpe = *p; if (*p) *p++ = '\0'; /* ** Show the numeric entity if the value: ** (1) Is greater than 255 and unhandled Unicode. ** (2) Is less than 32, and not valid and we don't ** have HTCJK set. ** (3) Is 127 and we don't have HTPassHighCtrlRaw ** or HTCJK set. ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set. */ if ((((what == P_hex) ? sscanf(cp, "%lx", &lcode) : sscanf(cp, "%ld", &lcode)) != 1) || lcode > 0x7fffffffL || lcode < 0) { state = S_recover; break; } else { code = lcode; if ((code == 1) || (code > 127 && code < 156)) { /* ** Assume these are Microsoft code points, inflicted on ** us by FrontPage. - FM ** ** MS FrontPage uses syntax like ™ in 128-159 ** range and doesn't follow Unicode standards for this ** area. Windows-1252 codepoints are assumed here. */ switch (code) { case 1: /* ** WHITE SMILING FACE */ code = 0x263a; break; case 128: /* ** EURO currency sign */ code = 0x20ac; break; case 130: /* ** SINGLE LOW-9 QUOTATION MARK (sbquo) */ code = 0x201a; break; case 132: /* ** DOUBLE LOW-9 QUOTATION MARK (bdquo) */ code = 0x201e; break; case 133: /* ** HORIZONTAL ELLIPSIS (hellip) */ code = 0x2026; break; case 134: /* ** DAGGER (dagger) */ code = 0x2020; break; case 135: /* ** DOUBLE DAGGER (Dagger) */ code = 0x2021; break; case 137: /* ** PER MILLE SIGN (permil) */ code = 0x2030; break; case 139: /* ** SINGLE LEFT-POINTING ANGLE QUOTATION MARK ** (lsaquo) */ code = 0x2039; break; case 145: /* ** LEFT SINGLE QUOTATION MARK (lsquo) */ code = 0x2018; break; case 146: /* ** RIGHT SINGLE QUOTATION MARK (rsquo) */ code = 0x2019; break; case 147: /* ** LEFT DOUBLE QUOTATION MARK (ldquo) */ code = 0x201c; break; case 148: /* ** RIGHT DOUBLE QUOTATION MARK (rdquo) */ code = 0x201d; break; case 149: /* ** BULLET (bull) */ code = 0x2022; break; case 150: /* ** EN DASH (ndash) */ code = 0x2013; break; case 151: /* ** EM DASH (mdash) */ code = 0x2014; break; case 152: /* ** SMALL TILDE (tilde) */ code = 0x02dc; break; case 153: /* ** TRADE MARK SIGN (trade) */ code = 0x2122; break; case 155: /* ** SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ** (rsaquo) */ code = 0x203a; break; default: /* ** Do not attempt a conversion ** to valid Unicode values. */ break; } } state = S_check_uni; } break; case S_check_uni: /* ** Show the numeric entity if the value: ** (2) Is less than 32, and not valid and we don't ** have HTCJK set. ** (3) Is 127 and we don't have HTPassHighCtrlRaw ** or HTCJK set. ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set. */ if ((code < 32 && code != 9 && code != 10 && code != 13 && HTCJK == NOCJK) || (code == 127 && !(HTPassHighCtrlRaw || HTCJK != NOCJK)) || (code > 127 && code < 160 && !HTPassHighCtrlNum)) { state = S_recover; break; } /* ** Convert the value as an unsigned char, ** hex escaped if isURL is set and it's ** 8-bit, and then recycle the terminator ** if it is not a semicolon. - FM */ if (code > 159 && stype == st_URL) { state = S_got_oututf8; break; } /* ** For 160 (nbsp), use that value if it's ** a hidden INPUT, otherwise use an ASCII ** space (32) if plain_space is TRUE, ** otherwise use the Lynx special character. - FM */ if (code == 160) { if (hidden) { ; } else if (plain_space) { code = ' '; } else { code = HT_NON_BREAK_SPACE; } state = S_got_outchar; break; } /* ** For 173 (shy), use that value if it's ** a hidden INPUT, otherwise ignore it ** if plain_space is TRUE, otherwise use ** the Lynx special character. - FM */ if (code == 173) { if (plain_space) { replace_buf[0] = '\0'; state = S_got_outstring; break; } else if (Back && !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL))) { ; /* nothing, may be translated later */ } else if (hidden || Back) { state = S_got_outchar; break; } else { code = LY_SOFT_HYPHEN; state = S_got_outchar; break; } } /* ** Seek a translation from the chartrans tables. */ if ((uck = UCTransUniChar(code, cs_to)) >= 32 && uck < 256 && (uck < 127 || uck >= lowest_8)) { code = uck; state = S_got_outchar; break; } else if ((uck == -4 || (repl_translated_C0 && uck > 0 && uck < 32)) && /* ** Not found; look for replacement string. */ (uck = UCTransUniCharStr(replace_buf, 60, code, cs_to, 0) >= 0)) { state = S_got_outstring; break; } if (output_utf8 && code > 127 && code < 0x7fffffffL) { state = S_got_oututf8; break; } /* ** For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), ** use the character reference if it's a hidden INPUT, ** otherwise use an ASCII space (32) if plain_space is ** TRUE, otherwise use the Lynx special character. - FM */ if (code == 8194 || code == 8195 || code == 8201) { if (hidden) { state = S_recover; } else if (plain_space) { code = ' '; state = S_got_outchar; } else { code = HT_EN_SPACE; state = S_got_outchar; } break; /* ** Ignore 8204 (zwnj), 8205 (zwj) ** 8206 (lrm), and 8207 (rlm), ** for now, if we got this far without ** finding a representation for them. */ } else if (code == 8204 || code == 8205 || code == 8206 || code == 8207) { CTRACE(tfp, "LYUCFullyTranslateString: Ignoring '%ld'.\n", code); replace_buf[0] = '\0'; state = S_got_outstring; break; /* ** Show the numeric entity if the value: ** (1) Is greater than 255 and unhandled Unicode. */ } else if (code > 255) { /* ** Illegal or not yet handled value. ** Return "&#" verbatim and continue ** from there. - FM */ state = S_recover; break; /* ** If it's ASCII, or is 8-bit but HTPassEightBitNum ** is set or the character set is "ISO Latin 1", ** use it's value. - FM */ } else if (code < 161 || (code < 256 && (HTPassEightBitNum || cs_to == LATIN1))) { /* ** No conversion needed. */ state = S_got_outchar; break; /* ** If we get to here, convert and handle ** the character as a named entity. - FM */ } else { name = HTMLGetEntityName(code - 160); state = S_check_name; break; } case S_recover: if (what == P_decimal || what == P_hex) { /* ** Illegal or not yet handled value. ** Return "&#" verbatim and continue ** from there. - FM */ *q++ = '&'; *q++ = '#'; if (what == P_hex) *q++ = 'x'; if (cpe != '\0') *(p-1) = cpe; p = cp; state = S_done; } else if (what == P_named) { *cp = cpe; *q++ = '&'; state = S_done; #ifdef NOTUSED_FOTEMODS } else if (T.strip_raw_char_in && (unsigned char)(*p) >= 0xc0 && (unsigned char)(*p) < 255) { code = (((*p) & 0x7f)); state = S_got_outchar; #endif /* NOTUSED_FOTEMODS */ } else if (!T.output_utf8 && stype == st_HTML && !hidden && !(HTPassEightBitRaw && (unsigned char)(*p) >= lowest_8)) { sprintf(replace_buf, "U%.2lX", code); state = S_got_outstring; } else { puni = p; code = (unsigned char)(*p); state = S_got_outchar; } break; case S_named: cp = ++p; while (*cp && (unsigned char)*cp < 127 && isalnum((unsigned char)*cp)) cp++; cpe = *cp; *cp = '\0'; /* ppuni = cp - 1; */ name = p; state = S_check_name; break; case S_check_name: /* ** Seek the Unicode value for the named entity. ** ** !!!! We manually recover the case of '=' terminator which ** is commonly found on query to CGI-scripts ** enclosed as href= URLs like "somepath/?x=1&yz=2" ** Without this dirty fix, submission of such URLs was broken ** if &yz string happened to be a recognized entity name. - LP */ if ( ((code = HTMLGetEntityUCValue(name)) > 0) && !((cpe == '=') && (stype == st_URL)) ) { state = S_check_uni; break; } /* ** Didn't find the entity. ** Return verbatim. */ state = S_recover; break; /* * * O U T P U T S T A T E S * * */ case S_got_oututf8: if (code > 255 || (code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) { UCPutUtf8ToBuffer(replace_buf, code, YES); state = S_got_outstring; } else { state = S_got_outchar; } break; case S_got_outstring: if (what == P_decimal || what == P_hex) { if (cpe != ';' && cpe != '\0') *(--p) = cpe; p--; } else if (what == P_named) { *cp = cpe; p = (*cp != ';') ? (cp - 1) : cp; } else if (what == P_utf8) { p = puni; } if (replace_buf[0] == '\0') { state = S_next_char; break; } if (stype == st_URL) { code = replace_buf[0]; /* assume string OK if first char is */ if (code >= 127 || (code < 32 && (code != 9 && code != 10 && code != 0))) { state = S_put_urlstring; break; } } REPLACE_STRING(replace_buf); state = S_next_char; break; case S_put_urlstring: esc = HTEscape(replace_buf, URL_XALPHAS); REPLACE_STRING(esc); FREE(esc); state = S_next_char; break; case S_got_outchar: if (what == P_decimal || what == P_hex) { if (cpe != ';' && cpe != '\0') *(--p) = cpe; p--; } else if (what == P_named) { *cp = cpe; p = (*cp != ';') ? (cp - 1) : cp; } else if (what == P_utf8) { p = puni; } if (stype == st_URL && /* Not a full HTEscape, only for 8bit and ctrl chars */ (TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */ (code < ' ' && (code != '\t' && code != '\n')))) { state = S_put_urlchar; break; } else if (!hidden && code == 10 && *p == 10 && q != qs && *(q-1) == 13) { /* ** If this is not a hidden string, and the current char is ** the LF ('\n') of a CRLF pair, drop the CR ('\r'). - KW */ *(q-1) = *p++; state = S_done; break; } *q++ = (char)code; state = S_next_char; break; case S_put_urlchar: *q++ = '%'; REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */ REPLACE_CHAR(hex[(TOASCII(code) & 15)]); /* fall through */ case S_next_char: p++; /* fall through */ case S_done: state = S_text; what = P_text; /* for next round */ } } *q = '\0'; if (chunk) { HTChunkPutb(CHUNK, qs, q-qs + 1); /* also terminates */ if (stype == st_URL) { LYTrimHead(chunk->data); LYTrimTail(chunk->data); } StrAllocCopy(*str, chunk->data); HTChunkFree(chunk); } else { if (stype == st_URL) { LYTrimHead(qs); LYTrimTail(qs); } } return str; } #undef REPLACE_CHAR #undef REPLACE_STRING PUBLIC BOOL LYUCFullyTranslateString ARGS7( char **, str, int, cs_from, int, cs_to, BOOL, use_lynx_specials, BOOLEAN, plain_space, BOOLEAN, hidden, CharUtil_st, stype) { BOOL ret = YES; /* May reallocate *str even if cs_to == 0 */ if (!LYUCFullyTranslateString_1(str, cs_from, cs_to, TRUE, use_lynx_specials, plain_space, hidden, NO, stype)) { ret = NO; } return ret; } PUBLIC BOOL LYUCTranslateBackFormData ARGS4( char **, str, int, cs_from, int, cs_to, BOOLEAN, plain_space) { char ** ret; /* May reallocate *str */ ret = (LYUCFullyTranslateString_1(str, cs_from, cs_to, FALSE, NO, plain_space, YES, YES, st_HTML)); return (ret != NULL); } /* ** This function processes META tags in HTML streams. - FM */ PUBLIC void LYHandleMETA ARGS4( HTStructured *, me, CONST BOOL*, present, CONST char **, value, char **, include GCC_UNUSED) { char *http_equiv = NULL, *name = NULL, *content = NULL; char *href = NULL, *id_string = NULL, *temp = NULL; char *cp, *cp0, *cp1 = NULL; int url_type = 0; int i; /* loop counter for meta tag wrapping - RDP */ if (!me || !present) return; /* * Load the attributes for possible use by Lynx. - FM */ if (present[HTML_META_HTTP_EQUIV] && value[HTML_META_HTTP_EQUIV] && *value[HTML_META_HTTP_EQUIV]) { StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]); convert_to_spaces(http_equiv, TRUE); LYUCFullyTranslateString(&http_equiv, me->tag_charset, me->tag_charset, NO, NO, YES, st_other); LYTrimHead(http_equiv); LYTrimTail(http_equiv); if (*http_equiv == '\0') { FREE(http_equiv); } } if (present[HTML_META_NAME] && value[HTML_META_NAME] && *value[HTML_META_NAME]) { StrAllocCopy(name, value[HTML_META_NAME]); convert_to_spaces(name, TRUE); LYUCFullyTranslateString(&name, me->tag_charset, me->tag_charset, NO, NO, YES, st_other); LYTrimHead(name); LYTrimTail(name); if (*name == '\0') { FREE(name); } } if (present[HTML_META_CONTENT] && value[HTML_META_CONTENT] && *value[HTML_META_CONTENT]) { /* * Technically, we should be creating a comma-separated * list, but META tags come one at a time, and we'll * handle (or ignore) them as each is received. Also, * at this point, we only trim leading and trailing * blanks from the CONTENT value, without translating * any named entities or numeric character references, * because how we should do that depends on what type * of information it contains, and whether or not any * of it might be sent to the screen. - FM */ StrAllocCopy(content, value[HTML_META_CONTENT]); convert_to_spaces(content, FALSE); LYTrimHead(content); LYTrimTail(content); if (*content == '\0') { FREE(content); } } CTRACE(tfp, "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\"\n", (http_equiv ? http_equiv : "NULL"), (name ? name : "NULL"), (content ? content : "NULL")); /* * Make sure we have META name/value pairs to handle. - FM */ if (!(http_equiv || name) || !content) goto free_META_copies; /* * Check for and deal with name=keywords. * For use with -traversal to enhance the resulting index. - RDP */ if (name && !strcasecomp(name,"keywords")) { if (meta_keywords != NULL) { FREE(meta_keywords); } StrAllocCopy(meta_keywords, content); /* * Guard against longer lines than operating system will accept, * by changing each space character to a newline, and each comma * that is not followed by a space likewise. - RDP */ for (i=0; i < strlen(meta_keywords) - 1; i++) { if (meta_keywords[i] == ' ' || (meta_keywords[i] == ',' && meta_keywords[i+1] != ' ')) { meta_keywords[i] = '\n'; } } /* * Check for and deal with name=description. * For use with -traversal to enhance the resulting index. - RDP */ } else if (name && !strcasecomp(name,"description")) { if (meta_description != NULL) { FREE(meta_description); } StrAllocCopy(meta_description, content); /* * Guard against longer lines than operating system will accept, * by changing each space character to a newline, and each comma * that is not followed by a space likewise. - RDP */ for (i=0; i < strlen(meta_description) - 1; i++) { if (meta_description[i] == ' '|| (meta_description[i] == ',' && meta_description[i+1] != ' ')) { meta_description[i] = '\n'; } } /* * Check for a no-cache Pragma * or Cache-Control directive. - FM */ } else if (!strcasecomp((http_equiv ? http_equiv : ""), "Pragma") || !strcasecomp((http_equiv ? http_equiv : ""), "Cache-Control")) { LYUCFullyTranslateString(&content, me->tag_charset, me->tag_charset, NO, NO, YES, st_other); LYTrimHead(content); LYTrimTail(content); if (!strcasecomp(content, "no-cache")) { me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); } /* * If we didn't get a Cache-Control MIME header, * and the META has one, convert to lowercase, * store it in the anchor element, and if we * haven't yet set no_cache, check whether we * should. - FM */ if ((!me->node_anchor->cache_control) && !strcasecomp((http_equiv ? http_equiv : ""), "Cache-Control")) { LYLowerCase(content); StrAllocCopy(me->node_anchor->cache_control, content); if (me->node_anchor->no_cache == FALSE) { cp0 = content; while ((cp = strstr(cp0, "no-cache")) != NULL) { cp += 8; while (*cp != '\0' && WHITE(*cp)) cp++; if (*cp == '\0' || *cp == ';') { me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); break; } cp0 = cp; } if (me->node_anchor->no_cache == TRUE) goto free_META_copies; cp0 = content; while ((cp = strstr(cp0, "max-age")) != NULL) { cp += 7; while (*cp != '\0' && WHITE(*cp)) cp++; if (*cp == '=') { cp++; while (*cp != '\0' && WHITE(*cp)) cp++; if (isdigit((unsigned char)*cp)) { cp0 = cp; while (isdigit((unsigned char)*cp)) cp++; if (*cp0 == '0' && cp == (cp0 + 1)) { me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); break; } } } cp0 = cp; } } } /* * Check for an Expires directive. - FM */ } else if (!strcasecomp((http_equiv ? http_equiv : ""), "Expires")) { /* * If we didn't get an Expires MIME header, * store it in the anchor element, and if we * haven't yet set no_cache, check whether we * should. Note that we don't accept a Date * header via META tags, because it's likely * to be untrustworthy, but do check for a * Date header from a server when making the * comparison. - FM */ LYUCFullyTranslateString(&content, me->tag_charset, me->tag_charset, NO, NO, YES, st_other); LYTrimHead(content); LYTrimTail(content); StrAllocCopy(me->node_anchor->expires, content); if (me->node_anchor->no_cache == FALSE) { if (!strcmp(content, "0")) { /* * The value is zero, which we treat as * an absolute no-cache directive. - FM */ me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); } else if (me->node_anchor->date != NULL) { /* * We have a Date header, so check if * the value is less than or equal to * that. - FM */ if (LYmktime(content, TRUE) <= LYmktime(me->node_anchor->date, TRUE)) { me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); } } else if (LYmktime(content, FALSE) == 0) { /* * Removed < from test to satisfy compiler. - RDP 7/7/00 * * We don't have a Date header, and * the value is in past for us. - FM */ me->node_anchor->no_cache = TRUE; HText_setNoCache(me->text); } } /* * Check for a text/html Content-Type with a * charset directive, if we didn't already set * the charset via a server's header. - AAC & FM */ } else if (!(me->node_anchor->charset && *me->node_anchor->charset) && !strcasecomp((http_equiv ? http_equiv : ""), "Content-Type")) { LYUCcharset * p_in = NULL; LYUCcharset * p_out = NULL; LYUCFullyTranslateString(&content, me->tag_charset, me->tag_charset, NO, NO, YES, st_other); LYTrimHead(content); LYTrimTail(content); LYLowerCase(content); if ((cp = strstr(content, "text/html;")) != NULL && (cp1 = strstr(content, "charset")) != NULL && cp1 > cp) { BOOL chartrans_ok = NO; char *cp3 = NULL, *cp4; int chndl; cp1 += 7; while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"') cp1++; StrAllocCopy(cp3, cp1); /* copy to mutilate more */ for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' && *cp4 != ';' && *cp4 != ':' && !WHITE(*cp4)); cp4++) { ; /* do nothing */ } *cp4 = '\0'; cp4 = cp3; chndl = UCGetLYhndl_byMIME(cp3); if (UCCanTranslateFromTo(chndl, current_char_set)) { chartrans_ok = YES; StrAllocCopy(me->node_anchor->charset, cp4); HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_PARSER, UCT_SETBY_STRUCTURED); } else if (chndl < 0) { /* * Got something but we don't recognize it. */ chndl = UCLYhndl_for_unrec; if (chndl < 0) /* UCLYhndl_for_unrec not defined :-( */ chndl = UCLYhndl_for_unspec; /* always >= 0 */ if (UCCanTranslateFromTo(chndl, current_char_set)) { chartrans_ok = YES; HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_PARSER, UCT_SETBY_STRUCTURED); } } if (chartrans_ok) { p_in = HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_PARSER); p_out = HTAnchor_setUCInfoStage(me->node_anchor, current_char_set, UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT); if (!p_out) { /* * Try again. */ p_out = HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_HTEXT); } if (!strcmp(p_in->MIMEname, "x-transparent")) { HTPassEightBitRaw = TRUE; HTAnchor_setUCInfoStage(me->node_anchor, HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT), UCT_STAGE_PARSER, UCT_SETBY_DEFAULT); } if (!strcmp(p_out->MIMEname, "x-transparent")) { HTPassEightBitRaw = TRUE; HTAnchor_setUCInfoStage(me->node_anchor, HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_PARSER), UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT); } if (p_in->enc != UCT_ENC_CJK) { HTCJK = NOCJK; if (!(p_in->codepoints & UCT_CP_SUBSETOF_LAT1) && chndl == current_char_set) { HTPassEightBitRaw = TRUE; } } else if (p_out->enc == UCT_ENC_CJK) { Set_HTCJK(p_in->MIMEname, p_out->MIMEname); } LYGetChartransInfo(me); /* ** Update the chartrans info homologously to ** a Content-Type MIME header with a charset ** parameter. - FM */ if (me->UCLYhndl != chndl) { HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_MIME, UCT_SETBY_STRUCTURED); HTAnchor_setUCInfoStage(me->node_anchor, chndl, UCT_STAGE_PARSER, UCT_SETBY_STRUCTURED); me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_PARSER); me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_PARSER); } UCSetTransParams(&me->T, me->inUCLYhndl, me->inUCI, me->outUCLYhndl, me->outUCI); } else { /* * Cannot translate. * If according to some heuristic the given * charset and the current display character * both are likely to be like ISO-8859 in * structure, pretend we have some kind * of match. */ BOOL given_is_8859 = (!strncmp(cp4, "iso-8859-", 9) && isdigit((unsigned char)cp4[9])); BOOL given_is_8859like = (given_is_8859 || !strncmp(cp4, "windows-", 8) || !strncmp(cp4, "cp12", 4) || !strncmp(cp4, "cp-12", 5)); BOOL given_and_display_8859like = (given_is_8859like && (strstr(LYchar_set_names[current_char_set], "ISO-8859") || strstr(LYchar_set_names[current_char_set], "windows-"))); if (given_is_8859) { cp1 = &cp4[10]; while (*cp1 && isdigit((unsigned char)(*cp1))) cp1++; *cp1 = '\0'; } if (given_and_display_8859like) { StrAllocCopy(me->node_anchor->charset, cp4); HTPassEightBitRaw = TRUE; } HTAlert(*cp4 ? cp4 : me->node_anchor->charset); } FREE(cp3); if (me->node_anchor->charset) { CTRACE(tfp, "LYHandleMETA: New charset: %s\n", me->node_anchor->charset); } } /* * Set the kcode element based on the charset. - FM */ HText_setKcode(me->text, me->node_anchor->charset, p_in); /* * Check for a Refresh directive. - FM */ } else if (!strcasecomp((http_equiv ? http_equiv : ""), "Refresh")) { char *Seconds = NULL; /* * Look for the Seconds field. - FM */ cp = LYSkipBlanks(content); if (*cp && isdigit(*cp)) { cp1 = cp; while (*cp1 && isdigit(*cp1)) cp1++; if (*cp1) *cp1++ = '\0'; StrAllocCopy(Seconds, cp); } if (Seconds) { /* * We have the seconds field. * Now look for a URL field - FM */ while (*cp1) { if (!strncasecomp(cp1, "URL", 3)) { cp = (cp1 + 3); while (*cp && (*cp == '=' || isspace((unsigned char)*cp))) cp++; cp1 = cp; while (*cp1 && !isspace((unsigned char)*cp1)) cp1++; *cp1 = '\0'; if (*cp) StrAllocCopy(href, cp); break; } cp1++; } if (href) { /* * We found a URL field, so check it out. - FM */ if (!(url_type = LYLegitimizeHREF(me, (char**)&href, TRUE, FALSE))) { /* * The specs require a complete URL, * but this is a Netscapism, so don't * expect the author to know that. - FM */ HTUserMsg(REFRESH_URL_NOT_ABSOLUTE); /* * Use the document's address * as the base. - FM */ if (*href != '\0') { temp = HTParse(href, me->node_anchor->address, PARSE_ALL); StrAllocCopy(href, temp); FREE(temp); } else { StrAllocCopy(href, me->node_anchor->address); HText_setNoCache(me->text); } } /* * Check whether to fill in localhost. - FM */ LYFillLocalFileURL((char **)&href, (me->inBASE ? me->base_href : me->node_anchor->address)); /* * Set the no_cache flag if the Refresh URL * is the same as the document's address. - FM */ if (!strcmp(href, me->node_anchor->address)) { HText_setNoCache(me->text); } } else { /* * We didn't find a URL field, so use * the document's own address and set * the no_cache flag. - FM */ StrAllocCopy(href, me->node_anchor->address); HText_setNoCache(me->text); } /* * Check for an anchor in http or https URLs. - FM */ if ((strncmp(href, "http", 4) == 0) && (cp = strrchr(href, '#')) != NULL) { StrAllocCopy(id_string, cp); *cp = '\0'; } if (me->inA) { /* * Ugh! The META tag, which is a HEAD element, * is in an Anchor, which is BODY element. All * we can do is close the Anchor and cross our * fingers. - FM */ if (me->inBoldA == TRUE && me->inBoldH == FALSE) HText_appendCharacter(me->text, LY_BOLD_END_CHAR); me->inBoldA = FALSE; HText_endAnchor(me->text, me->CurrentANum); me->inA = FALSE; me->CurrentANum = 0; } me->CurrentA = HTAnchor_findChildAndLink( me->node_anchor, /* Parent */ id_string, /* Tag */ href, /* Addresss */ (void *)0); /* Type */ if (id_string) *cp = '#'; FREE(id_string); LYEnsureSingleSpace(me); if (me->inUnderline == FALSE) HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR); HTML_put_string(me, "REFRESH("); HTML_put_string(me, Seconds); HTML_put_string(me, " sec):"); FREE(Seconds); if (me->inUnderline == FALSE) HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR); HTML_put_character(me, ' '); me->in_word = NO; HText_beginAnchor(me->text, me->inUnderline, me->CurrentA); if (me->inBoldH == FALSE) HText_appendCharacter(me->text, LY_BOLD_START_CHAR); HTML_put_string(me, href); FREE(href); if (me->inBoldH == FALSE) HText_appendCharacter(me->text, LY_BOLD_END_CHAR); HText_endAnchor(me->text, 0); LYEnsureSingleSpace(me); } /* * Check for a suggested filename via a Content-Disposition with * a filename=name.suffix in it, if we don't already have it * via a server header. - FM */ } else if (!(me->node_anchor->SugFname && *me->node_anchor->SugFname) && !strcasecomp((http_equiv ? http_equiv : ""), "Content-Disposition")) { cp = content; while (*cp != '\0' && strncasecomp(cp, "filename", 8)) cp++; if (*cp != '\0') { cp += 8; while ((*cp != '\0') && (WHITE(*cp) || *cp == '=')) cp++; while (*cp != '\0' && WHITE(*cp)) cp++; if (*cp != '\0') { StrAllocCopy(me->node_anchor->SugFname, cp); if (*me->node_anchor->SugFname == '\"') { if ((cp = strchr((me->node_anchor->SugFname + 1), '\"')) != NULL) { *(cp + 1) = '\0'; HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname); } else { FREE(me->node_anchor->SugFname); } if (me->node_anchor->SugFname != NULL && *me->node_anchor->SugFname == '\0') { FREE(me->node_anchor->SugFname); } } if ((cp = me->node_anchor->SugFname) != NULL) { while (*cp != '\0' && !WHITE(*cp)) cp++; *cp = '\0'; if (*me->node_anchor->SugFname == '\0') FREE(me->node_anchor->SugFname); } } } /* * Check for a Set-Cookie directive. - AK */ } else if (!strcasecomp((http_equiv ? http_equiv : ""), "Set-Cookie")) { /* * This will need to be updated when Set-Cookie/Set-Cookie2 * handling is finalized. For now, we'll still assume * "historical" cookies in META directives. - FM */ url_type = is_url(me->inBASE ? me->base_href : me->node_anchor->address); if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) { LYSetCookie(content, NULL, (me->inBASE ? me->base_href : me->node_anchor->address)); } } /* * Free the copies. - FM */ free_META_copies: FREE(http_equiv); FREE(name); FREE(content); } /* ** This function handles P elements in HTML streams. ** If start is TRUE it handles a start tag, and if ** FALSE, an end tag. We presently handle start ** and end tags identically, but this can lead to ** a different number of blank lines between the ** current paragraph and subsequent text when a P ** end tag is present or not in the markup. - FM */ PUBLIC void LYHandleP ARGS5( HTStructured *, me, CONST BOOL*, present, CONST char **, value, char **, include GCC_UNUSED, BOOL, start) { if (TRUE) { /* * FIG content should be a true block, which like P inherits * the current style. APPLET is like character elements or * an ALT attribute, unless it content contains a block element. * If we encounter a P in either's content, we set flags to treat * the content as a block. - FM */ if (start) { if (me->inFIG) me->inFIGwithP = TRUE; if (me->inAPPLET) me->inAPPLETwithP = TRUE; } UPDATE_STYLE; if (me->List_Nesting_Level >= 0) { /* * We're in a list. Treat P as an instruction to * create one blank line, if not already present, * then fall through to handle attributes, with * the "second line" margins. - FM */ if (me->inP) { if (me->inFIG || me->inAPPLET || me->inCAPTION || me->inCREDIT || me->sp->style->spaceAfter > 0 || (start && me->sp->style->spaceBefore > 0)) { LYEnsureDoubleSpace(me); } else { LYEnsureSingleSpace(me); } } } else if (me->sp[0].tag_number == HTML_ADDRESS) { /* * We're in an ADDRESS. Treat P as an instruction * to start a newline, if needed, then fall through * to handle attributes. - FM */ if (HText_LastLineSize(me->text, FALSE)) { HText_setLastChar(me->text, ' '); /* absorb white space */ HText_appendCharacter(me->text, '\r'); } } else { if (start) { if (!(me->inLABEL && !me->inP)) { HText_appendParagraph(me->text); } } else if (me->sp->style->spaceAfter > 0) { LYEnsureDoubleSpace(me); } else { LYEnsureSingleSpace(me); } me->inLABEL = FALSE; } me->in_word = NO; if (LYoverride_default_alignment(me)) { me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment; } else if (me->List_Nesting_Level >= 0 || ((me->Division_Level < 0) && (!strcmp(me->sp->style->name, "Normal") || !strcmp(me->sp->style->name, "Preformatted")))) { me->sp->style->alignment = HT_LEFT; } else { me->sp->style->alignment = me->current_default_alignment; } if (start) { if (present && present[HTML_P_ALIGN] && value[HTML_P_ALIGN]) { if (!strcasecomp(value[HTML_P_ALIGN], "center") && !(me->List_Nesting_Level >= 0 && !me->inP)) me->sp->style->alignment = HT_CENTER; else if (!strcasecomp(value[HTML_P_ALIGN], "right") && !(me->List_Nesting_Level >= 0 && !me->inP)) me->sp->style->alignment = HT_RIGHT; else if (!strcasecomp(value[HTML_P_ALIGN], "left") || !strcasecomp(value[HTML_P_ALIGN], "justify")) me->sp->style->alignment = HT_LEFT; } CHECK_ID(HTML_P_ID); } /* * Mark that we are starting a new paragraph * and don't have any of it's text yet. - FM * */ me->inP = FALSE; } return; } /* ** This function handles SELECT elements in HTML streams. ** If start is TRUE it handles a start tag, and if FALSE, ** an end tag. - FM */ PUBLIC void LYHandleSELECT ARGS5( HTStructured *, me, CONST BOOL*, present, CONST char **, value, char **, include GCC_UNUSED, BOOL, start) { int i; if (start == TRUE) { char *name = NULL; BOOLEAN multiple = NO; char *size = NULL; /* * Initialize the disable attribute. */ me->select_disabled = FALSE; /* * Make sure we're in a form. */ if (!me->inFORM) { if (TRACE) { fprintf(tfp, "Bad HTML: SELECT start tag not within FORM tag\n"); } else if (!me->inBadHTML) { HTUserMsg(BAD_HTML_USE_TRACE); me->inBadHTML = TRUE; } /* * We should have covered all crash possibilities with the * current TagSoup parser, so we'll allow it because some * people with other browsers use SELECT for "information" * popups, outside of FORM blocks, though no Lynx user * would do anything that awful, right? - FM *//*** return; ***/ } /* * Check for unclosed TEXTAREA. */ if (me->inTEXTAREA) { if (TRACE) { fprintf(tfp, "Bad HTML: Missing TEXTAREA end tag\n"); } else if (!me->inBadHTML) { HTUserMsg(BAD_HTML_USE_TRACE); me->inBadHTML = TRUE; } } /* * Set to know we are in a select tag. */ me->inSELECT = TRUE; if (!(present && present[HTML_SELECT_NAME] && value[HTML_SELECT_NAME] && *value[HTML_SELECT_NAME])) { StrAllocCopy(name, ""); } else if (strchr(value[HTML_SELECT_NAME], '&') == NULL) { StrAllocCopy(name, value[HTML_SELECT_NAME]); } else { StrAllocCopy(name, value[HTML_SELECT_NAME]); UNESCAPE_FIELDNAME_TO_STD(&name); } if (present && present[HTML_SELECT_MULTIPLE]) multiple=YES; if (present && present[HTML_SELECT_DISABLED]) me->select_disabled = TRUE; if (present && present[HTML_SELECT_SIZE] && value[HTML_SELECT_SIZE] && *value[HTML_SELECT_SIZE]) { #ifdef NOTDEFINED StrAllocCopy(size, value[HTML_SELECT_SIZE]); #else /* * Let the size be determined by the number of OPTIONs. - FM */ CTRACE(tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n", value[HTML_SELECT_SIZE]); #endif /* NOTDEFINED */ } if (me->inBoldH == TRUE && (multiple == NO || LYSelectPopups == FALSE)) { HText_appendCharacter(me->text, LY_BOLD_END_CHAR); me->inBoldH = FALSE; me->needBoldH = TRUE; } if (me->inUnderline == TRUE && (multiple == NO || LYSelectPopups == FALSE)) { HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR); me->inUnderline = FALSE; } if ((multiple == NO && LYSelectPopups == TRUE) && (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE || !me->sp->style->freeFormat) && HText_LastLineSize(me->text, FALSE) > (LYcols - 8)) { /* * Force a newline when we're using a popup in * a PRE block and are within 7 columns from the * right margin. This will allow for the '[' * popup designator and help avoid a wrap in the * underscore placeholder for the retracted popup * entry in the HText structure. - FM */ HTML_put_character(me, '\n'); me->in_word = NO; } LYCheckForID(me, present, value, (int)HTML_SELECT_ID); HText_beginSelect(name, ATTR_CS_IN, multiple, size); FREE(name); FREE(size); me->first_option = TRUE; } else { /* * Handle end tag. */ char *ptr; /* * Make sure we had a select start tag. */ if (!me->inSELECT) { if (TRACE) { fprintf(tfp, "Bad HTML: Unmatched SELECT end tag\n"); } else if (!me->inBadHTML) { HTUserMsg(BAD_HTML_USE_TRACE); me->inBadHTML = TRUE; } return; } /* * Set to know that we are no longer in a select tag. */ me->inSELECT = FALSE; /* * Clear the disable attribute. */ me->select_disabled = FALSE; /* * Finish the data off. */ HTChunkTerminate(&me->option); /* * Finish the previous option. */ ptr = HText_setLastOptionValue(me->text, me->option.data, me->LastOptionValue, LAST_ORDER, me->LastOptionChecked, me->UCLYhndl, ATTR_CS_IN); FREE(me->LastOptionValue); me->LastOptionChecked = FALSE; if (HTCurSelectGroupType == F_CHECKBOX_TYPE || LYSelectPopups == FALSE) { /* * Start a newline after the last checkbox/button option. */ LYEnsureSingleSpace(me); } else { /* * Output popup box with the default option to screen, * but use non-breaking spaces for output. */ if (ptr && me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) { /* * The code inadequately handles OPTION fields in PRE tags. * We'll put up a minimum of 6 characters, and if any * more would exceed the wrap column, we'll ignore them. */ for (i = 0; i < 6; i++) { if (*ptr == ' ') HText_appendCharacter(me->text, HT_NON_BREAK_SPACE); else HText_appendCharacter(me->text, *ptr); ptr++; } HText_setIgnoreExcess(me->text, TRUE); } for (; ptr && *ptr != '\0'; ptr++) { if (*ptr == ' ') HText_appendCharacter(me->text, HT_NON_BREAK_SPACE); else HText_appendCharacter(me->text, *ptr); } /* * Add end option character. */ if (!me->first_option) { HText_appendCharacter(me->text, ']'); HText_setLastChar(me->text, ']'); me->in_word = YES; } HText_setIgnoreExcess(me->text, FALSE); } HTChunkClear(&me->option); if (me->Underline_Level > 0 && me->inUnderline == FALSE) { HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR); me->inUnderline = TRUE; } if (me->needBoldH == TRUE && me->inBoldH == FALSE) { HText_appendCharacter(me->text, LY_BOLD_START_CHAR); me->inBoldH = TRUE; me->needBoldH = FALSE; } } } /* ** This function strips white characters and ** generally fixes up attribute values that ** were received from the SGML parser and ** are to be treated as partial or absolute ** URLs. - FM */ PUBLIC int LYLegitimizeHREF ARGS4( HTStructured *, me, char **, href, BOOL, force_slash, BOOL, strip_dots) { int url_type = 0; char *pound = NULL; char *fragment = NULL; if (!me || !href || *href == NULL || *(*href) == '\0') return(url_type); if (!LYTrimStartfile(*href)) { /* * Collapse spaces in the actual URL, but just * protect against tabs or newlines in the * fragment, if present. This seeks to cope * with atrocities inflicted on the Web by * authoring tools such as Frontpage. - FM */ if ((pound = strchr(*href, '#')) != NULL) { StrAllocCopy(fragment, pound); *pound = '\0'; convert_to_spaces(fragment, FALSE); } LYRemoveBlanks(*href); if (fragment != NULL) { StrAllocCat(*href, fragment); FREE(fragment); } } if (*(*href) == '\0') return(url_type); LYUCFullyTranslateString(href, me->tag_charset, me->tag_charset, NO, NO, YES, st_URL); url_type = is_url(*href); if (!url_type && force_slash && (!strcmp(*href, ".") || !strcmp(*href, "..")) && strncmp((me->inBASE ? me->base_href : me->node_anchor->address), "file:", 5)) { /* * The Fielding RFC/ID for resolving partial HREFs says * that a slash should be on the end of the preceding * symbolic element for "." and "..", but all tested * browsers only do that for an explicit "./" or "../", * so we'll respect the RFC/ID only if force_slash was * TRUE and it's not a file URL. - FM */ StrAllocCat(*href, "/"); } if ((!url_type && LYStripDotDotURLs && strip_dots && *(*href) == '.') && !strncasecomp((me->inBASE ? me->base_href : me->node_anchor->address), "http", 4)) { /* * We will be resolving a partial reference versus an http * or https URL, and it has lead dots, which may be retained * when resolving via HTParse(), but the request would fail * if the first element of the resultant path is two dots, * because no http or https server accepts such paths, and * the current URL draft, likely to become an RFC, says that * it's optional for the UA to strip them as a form of error * recovery. So we will, recursively, for http/https URLs, * like the "major market browsers" which made this problem * so common on the Web, but we'll also issue a message about * it, such that the bad partial reference might get corrected * by the document provider. - FM */ char *temp = NULL, *path = NULL, *cp; CONST char *str = ""; if (((temp = HTParse(*href, (me->inBASE ? me->base_href : me->node_anchor->address), PARSE_ALL)) != NULL && temp[0] != '\0') && (path = HTParse(temp, "", PARSE_PATH+PARSE_PUNCTUATION)) != NULL && !strncmp(path, "/..", 3)) { cp = (path + 3); if (LYIsHtmlSep(*cp) || *cp == '\0') { if ((me->inBASE ? me->base_href[4] : me->node_anchor->address[4]) == 's') { str = "s"; } if (TRACE) { fprintf(tfp, "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n", *href, str); fprintf(tfp, " Stripping lead dots.\n"); } else if (!me->inBadHREF) { HTUserMsg(BAD_PARTIAL_REFERENCE); me->inBadHREF = TRUE; } } if (*cp == '\0') { StrAllocCopy(*href, "/"); } else if (LYIsHtmlSep(*cp)) { while (!strncmp(cp, "/..", 3)) { if (*(cp + 3) == '/') { cp += 3; continue; } else if (*(cp + 3) == '\0') { *(cp + 1) = '\0'; *(cp + 2) = '\0'; } break; } StrAllocCopy(*href, cp); } } FREE(temp); FREE(path); } return(url_type); } /* ** This function checks for a Content-Base header, ** and if not present, a Content-Location header ** which is an absolute URL, and sets the BASE ** accordingly. If set, it will be replaced by ** any BASE tag in the HTML stream, itself. - FM */ PUBLIC void LYCheckForContentBase ARGS1( HTStructured *, me) { char *cp = NULL; BOOL present[HTML_BASE_ATTRIBUTES]; CONST char *value[HTML_BASE_ATTRIBUTES]; int i; if (!(me && me->node_anchor)) return; if (me->node_anchor->content_base != NULL) { /* * We have a Content-Base value. Use it * if it's non-zero length. - FM */ if (*me->node_anchor->content_base == '\0') return; StrAllocCopy(cp, me->node_anchor->content_base); LYRemoveBlanks(cp); } else if (me->node_anchor->content_location != NULL) { /* * We didn't have a Content-Base value, but do * have a Content-Location value. Use it if * it's an absolute URL. - FM */ if (*me->node_anchor->content_location == '\0') return; StrAllocCopy(cp, me->node_anchor->content_location); LYRemoveBlanks(cp); if (!is_url(cp)) { FREE(cp); return; } } else { /* * We had neither a Content-Base nor * Content-Location value. - FM */ return; } /* * If we collapsed to a zero-length value, * ignore it. - FM */ if (*cp == '\0') { FREE(cp); return; } /* * Pass the value to HTML_start_element as * the HREF of a BASE tag. - FM */ for (i = 0; i < HTML_BASE_ATTRIBUTES; i++) present[i] = NO; present[HTML_BASE_HREF] = YES; value[HTML_BASE_HREF] = (CONST char *)cp; (*me->isa->start_element)(me, HTML_BASE, present, value, 0, 0); FREE(cp); } /* ** This function creates NAMEd Anchors if a non-zero-length NAME ** or ID attribute was present in the tag. - FM */ PUBLIC void LYCheckForID ARGS4( HTStructured *, me, CONST BOOL *, present, CONST char **, value, int, attribute) { HTChildAnchor *ID_A = NULL; char *temp = NULL; if (!(me && me->text)) return; if (present && present[attribute] && value[attribute] && *value[attribute]) { /* * Translate any named or numeric character references. - FM */ StrAllocCopy(temp, value[attribute]); LYUCFullyTranslateString(&temp, me->tag_charset, me->tag_charset, NO, NO, YES, st_URL); /* * Create the link if we still have a non-zero-length string. - FM */ if ((temp[0] != '\0') && (ID_A = HTAnchor_findChildAndLink( me->node_anchor, /* Parent */ temp, /* Tag */ NULL, /* Addresss */ (void *)0))) { /* Type */ HText_beginAnchor(me->text, me->inUnderline, ID_A); HText_endAnchor(me->text, 0); } FREE(temp); } } /* ** This function creates a NAMEd Anchor for the ID string ** passed to it directly as an argument. It assumes the ** does not need checking for character references. - FM */ PUBLIC void LYHandleID ARGS2( HTStructured *, me, char *, id) { HTChildAnchor *ID_A = NULL; if (!(me && me->text) || !(id && *id)) return; /* * Create the link if we still have a non-zero-length string. - FM */ if ((ID_A = HTAnchor_findChildAndLink( me->node_anchor, /* Parent */ id, /* Tag */ NULL, /* Addresss */ (void *)0)) != NULL) { /* Type */ HText_beginAnchor(me->text, me->inUnderline, ID_A); HText_endAnchor(me->text, 0); } } /* ** This function checks whether we want to override ** the current default alignment for paragraphs and ** instead use that specified in the element's style ** sheet. - FM */ PUBLIC BOOLEAN LYoverride_default_alignment ARGS1( HTStructured *, me) { if (!me) return NO; switch(me->sp[0].tag_number) { case HTML_BLOCKQUOTE: case HTML_BQ: case HTML_NOTE: case HTML_FN: case HTML_ADDRESS: me->sp->style->alignment = HT_LEFT; return YES; default: break; } return NO; } /* ** This function inserts newlines if needed to create double spacing, ** and sets the left margin for subsequent text to the second line ** indentation of the current style. - FM */ PUBLIC void LYEnsureDoubleSpace ARGS1( HTStructured *, me) { if (!me || !me->text) return; if (HText_LastLineSize(me->text, FALSE)) { HText_setLastChar(me->text, ' '); /* absorb white space */ HText_appendCharacter(me->text, '\r'); HText_appendCharacter(me->text, '\r'); } else if (HText_PreviousLineSize(me->text, FALSE)) { HText_setLastChar(me->text, ' '); /* absorb white space */ HText_appendCharacter(me->text, '\r'); } else if (me->List_Nesting_Level >= 0) { HText_NegateLineOne(me->text); } me->in_word = NO; return; } /* ** This function inserts a newline if needed to create single spacing, ** and sets the left margin for subsequent text to the second line ** indentation of the current style. - FM */ PUBLIC void LYEnsureSingleSpace ARGS1( HTStructured *, me) { if (!me || !me->text) return; if (HText_LastLineSize(me->text, FALSE)) { HText_setLastChar(me->text, ' '); /* absorb white space */ HText_appendCharacter(me->text, '\r'); } else if (me->List_Nesting_Level >= 0) { HText_NegateLineOne(me->text); } me->in_word = NO; return; } /* ** This function resets paragraph alignments for block ** elements which do not have a defined style sheet. - FM */ PUBLIC void LYResetParagraphAlignment ARGS1( HTStructured *, me) { if (!me) return; if (me->List_Nesting_Level >= 0 || ((me->Division_Level < 0) && (!strcmp(me->sp->style->name, "Normal") || !strcmp(me->sp->style->name, "Preformatted")))) { me->sp->style->alignment = HT_LEFT; } else { me->sp->style->alignment = me->current_default_alignment; } return; } /* ** This example function checks whether the given anchor has ** an address with a file scheme, and if so, loads it into the ** the SGML parser's context->url element, which was passed as ** the second argument. The handle_comment() calling function in ** SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup ** into the corresponding stream, homologously to an SSI by an ** HTTP server. - FM ** ** For functions similar to this but which depend on details of ** the HTML handler's internal data, the calling interface should ** be changed, and functions in SGML.c would have to make sure not ** to call such functions inappropriately (e.g., calling a function ** specific to the Lynx_HTML_Handler when SGML.c output goes to ** some other HTStructured object like in HTMLGen.c), or the new ** functions could be added to the SGML.h interface. */ PUBLIC BOOLEAN LYCheckForCSI ARGS2( HTParentAnchor *, anchor, char **, url) { if (!(anchor && anchor->address)) return FALSE; if (strncasecomp(anchor->address, "file:", 5)) return FALSE; if (!LYisLocalHost(anchor->address)) return FALSE; StrAllocCopy(*url, anchor->address); return TRUE; } /* ** This function is called from the SGML parser to look at comments ** and see whether we should collect some info from them. Currently ** it only looks for comments with Message-Id and Subject info, in the ** exact form generated by MHonArc for archived mailing list. If found, ** the info is stored in the document's HTParentAnchor. It can later be ** used for generating a mail response. ** ** We are extra picky here because there isn't any official definition ** for these kinds of comments - we might (and still can) misinterpret ** arbitrary comments as something they aren't. ** ** If something doesn't look right, for example invalid characters, the ** strings are not stored. Mail responses will use something else as ** the subject, probably the document URL, and will not have an ** In-Reply-To header. ** ** All this is a hack - to do this the right way, mailing list archivers ** would have to agree on some better mechanism to make this kind of info ** from original mail headers available, for example using LINK. - kw */ PUBLIC BOOLEAN LYCommentHacks ARGS2( HTParentAnchor *, anchor, CONST char *, comment) { CONST char *cp = comment; size_t len; if (comment == NULL) return FALSE; if (!(anchor && anchor->address)) return FALSE; if (strncmp(comment, "!--X-Message-Id: ", 17) == 0) { char *messageid = NULL; char *p; for (cp = comment+17; *cp; cp++) { if ((unsigned char)*cp >= 127 || !isgraph((unsigned char)*cp)) { break; } } if (strcmp(cp, " --")) { return FALSE; } cp = comment + 17; StrAllocCopy(messageid, cp); /* This should be ok - message-id should only contain 7-bit ASCII */ if (!LYUCFullyTranslateString(&messageid, 0, 0, NO, NO, YES, st_URL)) return FALSE; for (p = messageid; *p; p++) { if ((unsigned char)*p >= 127 || !isgraph((unsigned char)*p)) { break; } } if (strcmp(p, " --")) { FREE(messageid); return FALSE; } if ((p = strchr(messageid, '@')) == NULL || p[1] == '\0') { FREE(messageid); return FALSE; } p = messageid; if ((len = strlen(p)) >= 8 && !strcmp(&p[len-3], " --")) { p[len-3] = '\0'; } else { FREE(messageid); return FALSE; } if (HTAnchor_setMessageID(anchor, messageid)) { FREE(messageid); return TRUE; } else { FREE(messageid); return FALSE; } } if (strncmp(comment, "!--X-Subject: ", 14) == 0) { char *subject = NULL; char *p; for (cp = comment+14; *cp; cp++) { if ((unsigned char)*cp >= 127 || !isprint((unsigned char)*cp)) { return FALSE; } } cp = comment + 14; StrAllocCopy(subject, cp); /* @@@ * This may not be the right thing for the subject - but mail * subjects shouldn't contain 8-bit characters in raw form anyway. * We have to unescape character entities, since that's what MHonArc * seems to generate. But if after that there are 8-bit characters * the string is rejected. We would probably not know correctly * what charset to assume anyway - the mail sender's can differ from * the archive's. And the code for sending mail cannot deal well * with 8-bit characters - we should not put them in the Subject * header in raw form, but don't have MIME encoding implemented. * Someone may want to do more about this... - kw */ if (!LYUCFullyTranslateString(&subject, 0, 0, NO, YES, NO, st_HTML)) return FALSE; for (p = subject; *p; p++) { if ((unsigned char)*p >= 127 || !isprint((unsigned char)*p)) { FREE(subject); return FALSE; } } p = subject; if ((len = strlen(p)) >= 4 && !strcmp(&p[len-3], " --")) { p[len-3] = '\0'; } else { FREE(subject); return FALSE; } if (HTAnchor_setSubject(anchor, subject)) { FREE(subject); return TRUE; } else { FREE(subject); return FALSE; } } return FALSE; }