1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
|
/*
* str_conv.c
*
* Copyright (c) Chris Putnam 1999-2021
*
* Source code released under the GPL version 2
*
* str routines for converting strs between character sets
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include "latex.h"
#include "entities.h"
#include "utf8.h"
#include "gb18030.h"
#include "charsets.h"
#include "str_conv.h"
/*
 * addentity()
 *
 * Formats ch as a decimal numeric character reference of the form
 * "&#NNN;" and passes the result to str_strcatc() for s.
 */
static void
addentity( str *s, unsigned int ch )
{
	/* "&#" + decimal digits + ";" + NUL.  A 64-bit value needs at
	 * most 20 digits, so 32 bytes is always sufficient; snprintf()
	 * bounds the write regardless.
	 */
	char buf[32];
	snprintf( buf, sizeof( buf ), "&#%u;", ch );
	str_strcatc( s, buf );
}
/*
 * minimalxmlchars()
 *
 * These are the five minimal predefined entities in XML.
 *
 * If ch is one of the five characters below, the matching entity
 * reference is passed to str_strcatc() for s and 1 is returned.
 * Otherwise s is left untouched and 0 is returned so that the caller
 * can emit ch by other means.
 */
static int
minimalxmlchars( str *s, unsigned int ch )
{
	switch ( ch ) {
	case 34: /* double quote */
		str_strcatc( s, "&quot;" );
		return 1;
	case 38: /* ampersand */
		str_strcatc( s, "&amp;" );
		return 1;
	case 39: /* apostrophe */
		str_strcatc( s, "&apos;" );
		return 1;
	case 60: /* less-than */
		str_strcatc( s, "&lt;" );
		return 1;
	case 62: /* greater-than */
		str_strcatc( s, "&gt;" );
		return 1;
	default:
		return 0;
	}
}
/*
 * addxmlchar()
 *
 * Emit a single character for XML output.  Characters handled by
 * minimalxmlchars() are done there; of the rest, values above 127
 * go through addentity() and everything else through str_addchar().
 */
static void
addxmlchar( str *s, unsigned int ch )
{
	if ( minimalxmlchars( s, ch ) )
		return;

	if ( ch <= 127 )
		str_addchar( s, ch );
	else
		addentity( s, ch );
}
/*
 * addutf8char()
 *
 * Emit ch via utf8_encode(), one byte at a time through str_addchar().
 *
 * When xmlout is non-zero the character is first offered to
 * minimalxmlchars(); if xmlout is STR_CONV_XMLOUT_ENTITIES, values
 * above 127 are written with addentity() instead of being encoded.
 */
static void
addutf8char( str *s, unsigned int ch, int xmlout )
{
	unsigned char bytes[6];
	int nbytes, k;

	if ( xmlout ) {
		if ( minimalxmlchars( s, ch ) )
			return;
		if ( xmlout == STR_CONV_XMLOUT_ENTITIES && ch > 127 ) {
			addentity( s, ch );
			return;
		}
	}

	/* the return value of utf8_encode() is used as the byte count */
	nbytes = utf8_encode( ch, bytes );
	k = 0;
	while ( k < nbytes ) {
		str_addchar( s, bytes[k] );
		k++;
	}
}
/*
 * addgb18030char()
 *
 * Emit ch via gb18030_encode(), one byte at a time through
 * str_addchar().
 *
 * XML handling mirrors addutf8char(): with xmlout set, the character
 * is first offered to minimalxmlchars(), and when xmlout is
 * STR_CONV_XMLOUT_ENTITIES values above 127 are written with
 * addentity() rather than encoded.
 */
static void
addgb18030char( str *s, unsigned int ch, int xmlout )
{
	unsigned char bytes[4];
	int nbytes, k;

	if ( xmlout ) {
		if ( minimalxmlchars( s, ch ) )
			return;
		if ( xmlout == STR_CONV_XMLOUT_ENTITIES && ch > 127 ) {
			addentity( s, ch );
			return;
		}
	}

	/* the return value of gb18030_encode() is used as the byte count */
	nbytes = gb18030_encode( ch, bytes );
	k = 0;
	while ( k < nbytes ) {
		str_addchar( s, bytes[k] );
		k++;
	}
}
/*
 * addlatexchar()
 *
 * Emit ch as LaTeX.  uni2latex() fills a local buffer with the LaTeX
 * form; per the original note here, a character that isn't recognized
 * as LaTeX comes back as "?".  In that case, and only if the user has
 * requested unicode output (utf8out), the character is written with
 * addutf8char() instead.  Otherwise the buffer contents (including a
 * bare "?") are written as-is.
 */
static void
addlatexchar( str *s, unsigned int ch, int xmlout, int utf8out )
{
	char latex[512];

	uni2latex( ch, latex, sizeof( latex ) );

	if ( !utf8out || strcmp( latex, "?" ) != 0 ) {
		str_strcatc( s, latex );
		return;
	}

	addutf8char( s, ch, xmlout );
}
/*
 * get_unicode()
 *
 * Decode one character from s->data starting at offset *pi and
 * return it.  *pi is advanced, either by the decoder that was called
 * or, for the raw single-byte case, by one.
 *
 * This can be a little tricky.  If the character is simply encoded,
 * such as UTF8 for > 128 or a numeric xml entity such as "&#534;",
 * then the output of decode_entity() and utf8_decode() will
 * necessarily be in the charsetin character set.  On the other hand,
 * if it's a fancy latex expression, such as "\alpha", or a
 * non-numeric xml entity like "&amp;", then we'll get the Unicode
 * value (because our lists only keep the Unicode equivalent).
 *
 * The local is_unicode flag records whether the decoded value is
 * already Unicode (remember that charsetin could be Unicode
 * independently).  If it is not, and charsetin is not
 * CHARSET_UNICODE, the value is mapped with charset_lookupchar()
 * before being returned.
 *
 * Decoder selection, in priority order:
 *   1. xmlin and the current byte is '&'      -> decode_entity()
 *   2. charsetin is CHARSET_GB18030           -> gb18030_decode()
 *   3. latexin, utf8in and high bit set       -> utf8_decode()
 *   4. latexin                                -> latex2char()
 *   5. utf8in                                 -> utf8_decode()
 *   6. otherwise                              -> raw byte
 */
static unsigned int
get_unicode( str *s, unsigned int *pi, int charsetin, int latexin, int utf8in, int xmlin )
{
	unsigned int value;
	int is_unicode = 0, entity_err = 0;

	if ( xmlin && s->data[*pi]=='&' ) {
		/* entity_err is filled in by decode_entity() but not examined */
		value = decode_entity( s->data, pi, &is_unicode, &entity_err );
	} else if ( charsetin==CHARSET_GB18030 ) {
		value = gb18030_decode( s->data, pi );
		is_unicode = 1;
	} else if ( latexin && utf8in && ( s->data[*pi] & 128 ) ) {
		/* Must handle bibtex files in UTF8/Unicode */
		value = utf8_decode( s->data, pi );
		is_unicode = 1;
	} else if ( latexin ) {
		value = latex2char( s->data, pi, &is_unicode );
	} else if ( utf8in ) {
		value = utf8_decode( s->data, pi );
	} else {
		/* NOTE(review): if s->data is plain char and char is signed,
		 * bytes >= 0x80 sign-extend in this cast -- confirm against
		 * the str definition and charset_lookupchar().
		 */
		value = (unsigned int) s->data[*pi];
		*pi += 1;
	}

	if ( is_unicode || charsetin==CHARSET_UNICODE )
		return value;

	return charset_lookupchar( charsetin, value );
}
/*
 * write_unicode()
 *
 * Write the character ch to s in the requested output form.  The
 * first matching option wins:
 *
 *   latexout                      -> addlatexchar()
 *   utf8out                       -> addutf8char()
 *   charsetout==CHARSET_GB18030   -> addgb18030char()
 *   otherwise                     -> charset_lookupuni(), then
 *                                    addxmlchar() if xmlout is set,
 *                                    else str_addchar()
 *
 * Always returns 1.
 */
static int
write_unicode( str *s, unsigned int ch, int charsetout, int latexout,
		int utf8out, int xmlout )
{
	unsigned int mapped;

	if ( latexout ) {
		addlatexchar( s, ch, xmlout, utf8out );
		return 1;
	}

	if ( utf8out ) {
		addutf8char( s, ch, xmlout );
		return 1;
	}

	if ( charsetout==CHARSET_GB18030 ) {
		addgb18030char( s, ch, xmlout );
		return 1;
	}

	mapped = charset_lookupuni( charsetout, ch );
	if ( xmlout )
		addxmlchar( s, mapped );
	else
		str_addchar( s, mapped );

	return 1;
}
/*
 * str_convert()
 *
 * Convert s in place from the input representation (charsetin,
 * latexin, utf8in, xmlin) to the output representation (charsetout,
 * latexout, utf8out, xmlout).  Characters are read one at a time with
 * get_unicode() and written to a temporary str with write_unicode();
 * the temporary is swapped into s only if every write succeeded.
 *
 * CHARSET_UNKNOWN for either charset is replaced by CHARSET_DEFAULT.
 *
 * Returns non-zero (1) on success, including when s is NULL or empty
 * and there is nothing to do.  Returns 0 if write_unicode() reports
 * failure, in which case s is left unchanged.
 */
int
str_convert( str *s,
	int charsetin, int latexin, int utf8in, int xmlin,
	int charsetout, int latexout, int utf8out, int xmlout )
{
	unsigned int offset = 0;
	unsigned int uch;
	str converted;
	int status = 1;

	if ( !s || s->len==0 )
		return status;

	/* Start the temporary from an empty C string rather than leaving
	 * it uninitialized, so that it is internally allocated.
	 * This fixes the NULL pointer dereference in CVE-2018-10775 in
	 * bibutils, where a string with a valid data pointer could be
	 * replaced by one without a valid data pointer because the input
	 * was invalid unicode.
	 * This probably also fixes CVE-2018-10773 and CVE-2018-10774,
	 * which are NULL dereferences also likely found by a fuzzer, but
	 * without test cases in the report that can't be confirmed.
	 */
	str_initstrc( &converted, "" );

	if ( charsetin==CHARSET_UNKNOWN )
		charsetin = CHARSET_DEFAULT;
	if ( charsetout==CHARSET_UNKNOWN )
		charsetout = CHARSET_DEFAULT;

	/* stop at the terminating NUL or at the first failed write */
	while ( status && s->data[offset] ) {
		uch = get_unicode( s, &offset, charsetin, latexin, utf8in, xmlin );
		status = write_unicode( &converted, uch, charsetout, latexout, utf8out, xmlout );
	}

	/* after a successful swap, converted holds the old contents of s */
	if ( status )
		str_swapstrings( s, &converted );

	str_free( &converted );
	return status;
}
|