Fix STR #2348. Files encoded with UTF-8 or CP1252 are accepted. Any non-UTF-8-encoded
data is interpreted according to CP1252 and transcoded to UTF-8. By default, a warning message is displayed when the input file was transcoded. This default behavior can be modified by changing a function pointer. A flag has been added to the Fl_Text_Buffer object that informs the caller if the input file was transcoded to UTF-8. The Fl_Text_Buffer.cxx file contains some preliminary code that could be used in the future to input other encodings provided they are fixed-length (e.g., all ISO-8859-* character sets, UTF-16). This code is not compiled at this point. git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@8004 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
This commit is contained in:
parent
395f5a70aa
commit
85a03a76c9
@ -300,20 +300,22 @@ public:
|
||||
non-zero on error (strerror() contains reason). 1 indicates open
|
||||
for read failed (no data loaded). 2 indicates error occurred
|
||||
while reading data (data was partially loaded).
|
||||
File can be UTF-8 or CP1252-encoded.
|
||||
If the input file is not UTF-8-encoded, the Fl_Text_Buffer widget will contain
|
||||
UTF-8-recoded data. By default, the message Fl_Text_Buffer::file_encoding_warning_message
|
||||
will warn the user about this.
|
||||
\see input_file_was_reencoded and transcoding_warning_action.
|
||||
*/
|
||||
int insertfile(const char *file, int pos, int buflen = 128*1024);
|
||||
|
||||
/**
|
||||
Appends the named file to the end of the buffer. Returns 0 on
|
||||
success, non-zero on error (strerror() contains reason). 1 indicates
|
||||
open for read failed (no data loaded). 2 indicates error occurred
|
||||
while reading data (data was partially loaded).
|
||||
Appends the named file to the end of the buffer. See also insertfile().
|
||||
*/
|
||||
int appendfile(const char *file, int buflen = 128*1024)
|
||||
{ return insertfile(file, length(), buflen); }
|
||||
|
||||
/**
|
||||
Loads a text file into the buffer
|
||||
Loads a text file into the buffer. See also insertfile().
|
||||
*/
|
||||
int loadfile(const char *file, int buflen = 128*1024)
|
||||
{ select(0, length()); remove_selection(); return appendfile(file, buflen); }
|
||||
@ -669,6 +671,27 @@ public:
|
||||
*/
|
||||
int utf8_align(int) const;
|
||||
|
||||
/**
|
||||
\brief true iff the loaded file has been re-encoded to UTF-8
|
||||
*/
|
||||
int input_file_was_reencoded;
|
||||
|
||||
/** This message may be displayed using the fl_alert() function when a file
|
||||
which was not UTF-8 encoded is input.
|
||||
*/
|
||||
static const char* file_encoding_warning_message;
|
||||
|
||||
/**
|
||||
\brief Pointer to a function called after reading a non UTF-8 encoded file.
|
||||
|
||||
This function is called after reading a file if the file content
|
||||
was re-encoded to UTF-8. Its default implementation calls fl_alert()
|
||||
with the text of \ref file_encoding_warning_message. No warning message is
|
||||
displayed if this pointer is set to NULL. Use \ref input_file_was_reencoded
|
||||
to be programmatically informed if file input required re-encoding to UTF-8.
|
||||
*/
|
||||
void (*transcoding_warning_action)(Fl_Text_Buffer*);
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include <ctype.h>
|
||||
#include <FL/Fl.H>
|
||||
#include <FL/Fl_Text_Buffer.H>
|
||||
#include <FL/fl_ask.H>
|
||||
|
||||
|
||||
/*
|
||||
@ -102,6 +103,10 @@ static void undobuffersize(int n)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 Default action run after a file had to be re-encoded to UTF-8.

 Shows the text of file_encoding_warning_message, reached through \p text,
 in an fl_alert() dialog.

 \param text the buffer whose input file was transcoded
 */
static void def_transcoding_warning_action(Fl_Text_Buffer *text)
{
  // The message pointer is replaceable data, not a format string: route it
  // through "%s" so that a '%' in the message is displayed literally instead
  // of being interpreted by fl_alert()'s printf-style formatting.
  fl_alert("%s", text->file_encoding_warning_message);
}
|
||||
|
||||
/*
|
||||
Initialize all variables.
|
||||
@ -128,6 +133,8 @@ Fl_Text_Buffer::Fl_Text_Buffer(int requestedSize, int preferredGapSize)
|
||||
mPredeleteCbArgs = NULL;
|
||||
mCursorPosHint = 0;
|
||||
mCanUndo = 1;
|
||||
input_file_was_reencoded = 0;
|
||||
transcoding_warning_action = def_transcoding_warning_action;
|
||||
}
|
||||
|
||||
|
||||
@ -1513,30 +1520,174 @@ int Fl_Text_Buffer::findchar_backward(int startPos, unsigned int searchChar,
|
||||
return 0;
|
||||
}
|
||||
|
||||
//#define FIXED_LENGTH_ENCODING // shows how to process any fixed-length encoding
|
||||
#ifdef FIXED_LENGTH_ENCODING
|
||||
|
||||
// Decodes the single CP1252 byte at *p into its UCS value and moves p
// forward by one byte.
unsigned cp1252toucs(char* &p)
{
  // UCS values for the CP1252 bytes 0x80..0x9f (Microsoft CP1252 character
  // set); every byte outside that range is returned unchanged.
  static unsigned high_range[32] = {
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
  };
  const unsigned char byte = *(unsigned char*)p;
  ++p;
  if (byte >= 0x80 && byte < 0xa0)
    return high_range[byte - 0x80];
  return byte;
}
|
||||
|
||||
// Returns the UCS equivalent of the 16-bit unit at *p and advances p by 2.
// The unit is read most significant byte first (UTF-16BE byte order).
// Each 16-bit unit is returned as is; surrogate pairs are not combined.
unsigned utf16toucs(char* &p)
{
  // Build the value arithmetically rather than by overlaying a union on a
  // 16-bit integer: the result then does not depend on WORDS_BIGENDIAN being
  // configured, on a U16 typedef being in scope, or on reading a union
  // member other than the one last written.
  const unsigned high = *(unsigned char*)p++;
  const unsigned low = *(unsigned char*)p++;
  return (high << 8) | low;
}
|
||||
|
||||
// filter that produces, from an input stream fed by reading from fp,
|
||||
// a UTF-8-encoded output stream written in buffer.
|
||||
// Input can be any fixed-length (e.g., 8-bit, UTF-16) encoding.
|
||||
// Output is true UTF-8.
|
||||
// p_trf points to a function that transforms encoded byte(s) into UCS
|
||||
// and that increases the pointer by the adequate quantity
|
||||
static int fixed_length_input_filter(char *buffer, int buflen,
|
||||
char *line, int sline, char* &endline,
|
||||
unsigned (*p_trf)(char* &),
|
||||
FILE *fp)
|
||||
{
|
||||
char *p, *q, multibyte[5];
|
||||
int lq, r, offset;
|
||||
p = endline = line;
|
||||
q = buffer;
|
||||
while (q < buffer + buflen) {
|
||||
if (p >= endline) {
|
||||
r = fread(line, 1, sline, fp);
|
||||
endline = line + r;
|
||||
if (r == 0) return q - buffer;
|
||||
p = line;
|
||||
}
|
||||
if (q + 4 /*max width of utf-8 char*/ > buffer + buflen) {
|
||||
memmove(line, p, endline - p);
|
||||
endline -= (p - line);
|
||||
return q - buffer;
|
||||
}
|
||||
lq = fl_utf8encode( p_trf(p), multibyte );
|
||||
memcpy(q, multibyte, lq);
|
||||
q += lq;
|
||||
}
|
||||
memmove(line, p, endline - p);
|
||||
endline -= (p - line);
|
||||
return q - buffer;
|
||||
}
|
||||
#endif // FIXED_LENGTH_ENCODING
|
||||
|
||||
/*
|
||||
filter that produces, from an input stream fed by reading from fp,
|
||||
a UTF-8-encoded output stream written in buffer.
|
||||
Input can be UTF-8. If it is not, it is decoded with CP1252.
|
||||
Output is UTF-8.
|
||||
*input_was_changed is set to true if the input was not strict UTF-8 so output
|
||||
differs from input.
|
||||
*/
|
||||
static int utf8_input_filter(char *buffer, int buflen, char *line, int sline, char* &endline,
|
||||
FILE *fp, int *input_was_changed)
|
||||
{
|
||||
char *p, *q, multibyte[5];
|
||||
int l, lp, lq, r;
|
||||
unsigned u;
|
||||
p = endline = line;
|
||||
q = buffer;
|
||||
while (q < buffer + buflen) {
|
||||
if (p >= endline) {
|
||||
r = fread(line, 1, sline, fp);
|
||||
endline = line + r;
|
||||
if (r == 0) return q - buffer;
|
||||
p = line;
|
||||
}
|
||||
l = fl_utf8len1(*p);
|
||||
if (p + l > endline) {
|
||||
memmove(line, p, endline - p);
|
||||
endline -= (p - line);
|
||||
r = fread(endline, 1, sline - (endline - line), fp);
|
||||
endline += r;
|
||||
p = line;
|
||||
if (endline - line < l) break;
|
||||
}
|
||||
while ( l > 0) {
|
||||
u = fl_utf8decode(p, p+l, &lp);
|
||||
lq = fl_utf8encode(u, multibyte);
|
||||
if (lp != l || lq != l) *input_was_changed = true;
|
||||
if (q + lq > buffer + buflen) {
|
||||
memmove(line, p, endline - p);
|
||||
endline -= (p - line);
|
||||
return q - buffer;
|
||||
}
|
||||
memcpy(q, multibyte, lq);
|
||||
q += lq;
|
||||
p += lp;
|
||||
l -= lp;
|
||||
}
|
||||
}
|
||||
memmove(line, p, endline - p);
|
||||
endline -= (p - line);
|
||||
return q - buffer;
|
||||
}
|
||||
|
||||
const char *Fl_Text_Buffer::file_encoding_warning_message =
|
||||
"Displayed text contains the UTF-8 re-encoding\n"
|
||||
"of the input file which was not UTF-8 encoded.\n"
|
||||
"Some changes may have occurred.";
|
||||
|
||||
/*
|
||||
Insert text from a file.
|
||||
Unicode safe. Input must be correct UTF-8!
|
||||
Input file can be of various encodings according to what input filter is used.
|
||||
utf8_input_filter accepts UTF-8 or CP1252 as input encoding.
|
||||
Output is always UTF-8.
|
||||
*/
|
||||
int Fl_Text_Buffer::insertfile(const char *file, int pos, int /*buflen*/) {
|
||||
int Fl_Text_Buffer::insertfile(const char *file, int pos, int buflen)
|
||||
{
|
||||
FILE *fp;
|
||||
if (!(fp = fl_fopen(file, "r")))
|
||||
return 1;
|
||||
fseek(fp, 0, SEEK_END);
|
||||
size_t filesize = ftell(fp);
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
if (!filesize) return 0;
|
||||
char *buffer = new char[filesize+1];
|
||||
// Note: If we read Windows text files in text mode, then Windows
|
||||
// strips the <CR>'s from the text. Hence, rsize < filesize !
|
||||
size_t rsize = fread(buffer, 1, filesize, fp);
|
||||
if (rsize > 0) {
|
||||
buffer[rsize] = (char) 0;
|
||||
char *buffer = new char[buflen + 1];
|
||||
char *endline, line[100];
|
||||
int l;
|
||||
input_file_was_reencoded = false;
|
||||
endline = line;
|
||||
while (true) {
|
||||
#ifdef FIXED_LENGTH_ENCODING
|
||||
// example of 16-bit encoding: UTF-16
|
||||
l = fixed_length_input_filter(buffer, buflen,
|
||||
line, sizeof(line), endline,
|
||||
utf16toucs, // use cp1252toucs to read CP1252-encoded files
|
||||
fp);
|
||||
input_file_was_reencoded = true;
|
||||
#else
|
||||
l = utf8_input_filter(buffer, buflen, line, sizeof(line), endline,
|
||||
fp, &input_file_was_reencoded);
|
||||
#endif
|
||||
if (l == 0) break;
|
||||
buffer[l] = 0;
|
||||
insert(pos, buffer);
|
||||
}
|
||||
pos += l;
|
||||
}
|
||||
int e = ferror(fp) ? 2 : 0;
|
||||
fclose(fp);
|
||||
delete[]buffer;
|
||||
if ( (!e) && input_file_was_reencoded && transcoding_warning_action) {
|
||||
transcoding_warning_action(this);
|
||||
}
|
||||
return e;
|
||||
}
|
||||
|
||||
|
@ -492,6 +492,7 @@ void load_file(const char *newfile, int ipos) {
|
||||
int r;
|
||||
if (!insert) r = textbuf->loadfile(newfile);
|
||||
else r = textbuf->insertfile(newfile, ipos);
|
||||
changed = changed || textbuf->input_file_was_reencoded;
|
||||
if (r)
|
||||
fl_alert("Error reading from file \'%s\':\n%s.", newfile, strerror(errno));
|
||||
else
|
||||
@ -795,6 +796,7 @@ Fl_Window* new_view() {
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
textbuf = new Fl_Text_Buffer;
|
||||
//textbuf->transcoding_warning_action = NULL;
|
||||
style_init();
|
||||
|
||||
Fl_Window* window = new_view();
|
||||
|
Loading…
Reference in New Issue
Block a user