Ticket #1838: source_codepage autodetect with enca program.

Added support for autodetecting codepages via the 'enca' program.

New parameter in user configuration file (~/.mc/ini):

[Misc]
autodetect_codeset=(one of `enca --list languages | cut -d : -f1`)

Signed-off-by: Slava Zanko <slavazanko@gmail.com>

Little code cleanup.

Signed-off-by: Ilia Maslakov <il.smind@gmail.com>

Type accuracy. Added missing includes.

Signed-off-by: Andrew Borodin <aborodin@vmail.ru>
This commit is contained in:
Leonid Myravjev 2010-02-05 13:02:47 +02:00 committed by Slava Zanko
parent 110977b161
commit 56d955152c
9 changed files with 173 additions and 77 deletions

View File

@ -7,7 +7,7 @@ CP1251 Windows 1251
CP437 CP 437
CP850 CP 850
CP852 CP 852
CP866 CP 866
IBM866 CP 866
KOI8-R KOI8-R
KOI8-U KOI8-U
UTF-8 UTF-8

View File

@ -334,6 +334,9 @@ edit_cmd (void)
/*
 * Start the editor through do_edit (NULL).
 * NOTE(review): a NULL argument presumably means "new, unnamed file" --
 * confirm against do_edit().
 */
void
edit_cmd_new (void)
{
#ifdef HAVE_CHARSET
    /* Start from the configured default instead of whatever codepage
       was last stored in source_codepage. */
    source_codepage = default_source_codepage;
#endif
    do_edit (NULL);
}

149
src/ext.c
View File

@ -45,10 +45,12 @@
#include "user.h"
#include "main.h"
#include "wtools.h"
#include "ext.h"
#include "execute.h"
#include "history.h"
#include "layout.h"
#include "charsets.h" /* get_codepage_index */
#include "selcodepage.h" /* do_set_codepage */
#include "ext.h"
/* If set, we execute the file command to check the file type */
int use_file_to_check_type = 1;
@ -305,6 +307,45 @@ exec_extension (const char *filename, const char *lc_data, int *move_dir,
# define FILE_CMD "file "
#endif
/*
 * Run cmd_file with args appended (stderr is sent to /dev/null) and store
 * the first chunk of its output -- what a single fgets() call returns,
 * i.e. at most one line of up to buflen - 1 characters -- in buf.
 *
 * buf is always left null-terminated; on every failure path buf[0] is '\0'.
 * Return 1 if data was read, 0 if nothing could be read,
 * -1 for fatal errors (unusable buffer, popen() or setvbuf() failure).
 */
static int
get_popen_information (const char *cmd_file, const char *args, char *buf, int buflen)
{
    gboolean read_bytes;
    char *command;
    FILE *f;

    /* Without room for at least the terminator nothing can be reported */
    if (buf == NULL || buflen <= 0)
        return -1;

    /* Terminate up front so that every early return honours the contract */
    buf[0] = '\0';

    command = g_strconcat (cmd_file, args, " 2>/dev/null", (char *) NULL);
    f = popen (command, "r");
    g_free (command);

    if (f == NULL)
        return -1;

#ifdef __QNXNTO__
    if (setvbuf (f, NULL, _IOFBF, 0) != 0) {
        (void) pclose (f);
        return -1;
    }
#endif

    read_bytes = (fgets (buf, buflen, f) != NULL);
    if (!read_bytes)
        buf[0] = '\0';          /* buffer contents are unspecified after a failed read */
    (void) pclose (f);

    buf[buflen - 1] = '\0';     /* Paranoid termination */
    return read_bytes ? 1 : 0;
}
/*
* Run the "file" command on the local file.
* Return 1 if the data is valid, 0 otherwise, -1 for fatal errors.
@ -312,33 +353,38 @@ exec_extension (const char *filename, const char *lc_data, int *move_dir,
static int
get_file_type_local (const char *filename, char *buf, int buflen)
{
int read_bytes = 0;
char *tmp = name_quote (filename, 0);
char *command = g_strconcat (FILE_CMD, tmp, " 2>/dev/null", (char *) NULL);
FILE *f = popen (command, "r");
char *tmp;
int ret;
tmp = name_quote (filename, 0);
ret = get_popen_information (FILE_CMD, tmp, buf, buflen);
g_free (tmp);
g_free (command);
if (f != NULL) {
#ifdef __QNXNTO__
if (setvbuf (f, NULL, _IOFBF, 0) != 0) {
(void)pclose (f);
return -1;
}
#endif
read_bytes = (fgets (buf, buflen, f)
!= NULL);
if (read_bytes == 0)
buf[0] = 0;
pclose (f);
} else {
return -1;
}
return (read_bytes > 0);
return ret;
}
/*
 * Query the "enca" program about the local file, passing the shell-quoted
 * autodetect_codeset as the -L argument, and leave its output in buf.
 * Return 1 if the data is valid, 0 otherwise, -1 for fatal errors
 * (the value comes straight from get_popen_information()).
 */
static int
get_file_encoding_local (const char *filename, char *buf, int buflen)
{
    char *quoted_name = name_quote (filename, 0);
    char *quoted_lang = name_quote (autodetect_codeset, 0);
    char *enca_args = g_strconcat (" -L", quoted_lang, " -i ", quoted_name, (char *) NULL);
    const int result = get_popen_information ("enca", enca_args, buf, buflen);

    g_free (enca_args);
    g_free (quoted_lang);
    g_free (quoted_name);

    return result;
}
/*
* Invoke the "file" command on the file and match its output against PTR.
@ -353,39 +399,58 @@ regex_check_type (const char *filename, const char *ptr, int *have_type)
/* Following variables are valid if *have_type is 1 */
static char content_string[2048];
static int content_shift = 0;
static char encoding_id[21]; /* CSISO51INISCYRILLIC -- 20 */
static size_t content_shift = 0;
static int got_data = 0;
if (!use_file_to_check_type) {
if (!use_file_to_check_type)
return 0;
}
if (!*have_type) {
if (*have_type == 0) {
char *realname; /* name used with "file" */
char *localfile;
int got_encoding_data;
/* Don't repeat even unsuccessful checks */
*have_type = 1;
localfile = mc_getlocalcopy (filename);
if (!localfile)
if (localfile == NULL)
return -1;
realname = localfile;
got_data =
get_file_type_local (localfile, content_string,
sizeof (content_string));
got_encoding_data = is_autodetect_codeset_enabled
? get_file_encoding_local (localfile, encoding_id, sizeof (encoding_id))
: 0;
mc_ungetlocalcopy (filename, localfile, 0);
if (got_encoding_data > 0) {
char *pp;
int cp_id;
pp = strchr (encoding_id, '\n');
if (pp != NULL)
*pp = '\0';
cp_id = get_codepage_index (encoding_id);
if (cp_id == -1)
cp_id = default_source_codepage;
do_set_codepage (cp_id);
}
got_data = get_file_type_local (localfile, content_string, sizeof (content_string));
if (got_data > 0) {
char *pp;
/* Paranoid termination */
content_string[sizeof (content_string) - 1] = 0;
pp = strchr (content_string, '\n');
if (pp != NULL)
*pp = '\0';
if ((pp = strchr (content_string, '\n')) != 0)
*pp = 0;
if (!strncmp (content_string, realname, strlen (realname))) {
if (strncmp (content_string, realname, strlen (realname)) == 0) {
/* Skip "realname: " */
content_shift = strlen (realname);
if (content_string[content_shift] == ':') {
@ -393,21 +458,21 @@ regex_check_type (const char *filename, const char *ptr, int *have_type)
for (content_shift++;
content_string[content_shift] == ' '
|| content_string[content_shift] == '\t';
content_shift++);
content_shift++)
;
}
}
} else {
/* No data */
content_string[0] = 0;
content_string[0] = '\0';
}
g_free (realname);
}
if (got_data == -1) {
if (got_data == -1)
return -1;
}
if (content_string[0]
if (content_string[0] != '\0'
&& mc_search (ptr, content_string + content_shift, MC_SEARCH_T_REGEX)) {
found = 1;
}

View File

@ -2271,6 +2271,7 @@ main (int argc, char *argv[])
done_key ();
#ifdef HAVE_CHARSET
free_codepages_list ();
g_free (autodetect_codeset);
#endif
str_uninit_strings ();

View File

@ -44,7 +44,10 @@ extern int option_tab_spacing;
#ifdef HAVE_CHARSET
extern int source_codepage;
extern int default_source_codepage;
extern int display_codepage;
extern char* autodetect_codeset;
extern gboolean is_autodetect_codeset_enabled;
#else
extern int eight_bit_clean;
extern int full_eight_bits;

View File

@ -2346,6 +2346,10 @@ do_enter_on_file_entry (file_entry *fe)
g_free (cmd);
}
#if HAVE_CHARSET
source_codepage = default_source_codepage;
#endif
return 1;
}
@ -3140,7 +3144,7 @@ set_panel_encoding (WPanel *panel)
const char *errmsg;
int r;
r = select_charset (-1, -1, source_codepage, FALSE);
r = select_charset (-1, -1, default_source_codepage, FALSE);
if (r == SELECT_CHARSET_CANCEL)
return; /* Cancel */

View File

@ -40,7 +40,10 @@
/* Numbers of (file I/O) and (input/display) codepages. -1 if not selected */
int source_codepage = -1;
int default_source_codepage = -1;
int display_codepage = -1;
char* autodetect_codeset = NULL;
gboolean is_autodetect_codeset_enabled = FALSE;
static unsigned char
get_hotkey (int n)
@ -108,19 +111,14 @@ select_charset (int center_y, int center_x, int current_charset, gboolean seldis
}
}
/* Set codepage */
gboolean
do_select_codepage (void)
do_set_codepage (int codepage)
{
const char *errmsg = NULL;
int r;
r = select_charset (-1, -1, source_codepage, FALSE);
if (r == SELECT_CHARSET_CANCEL)
return FALSE;
source_codepage = r;
errmsg = init_translation_table (r == SELECT_CHARSET_NO_TRANSLATE ?
source_codepage = codepage;
errmsg = init_translation_table (codepage == SELECT_CHARSET_NO_TRANSLATE ?
display_codepage : source_codepage,
display_codepage);
if (errmsg != NULL)
@ -129,4 +127,19 @@ do_select_codepage (void)
return (errmsg == NULL);
}
/*
 * Show the codepage selection menu, preselecting default_source_codepage.
 * If the dialog is cancelled nothing changes and FALSE is returned;
 * otherwise the choice becomes the new default and is applied through
 * do_set_codepage(), whose result is returned.
 */
gboolean
do_select_codepage (void)
{
    const int choice = select_charset (-1, -1, default_source_codepage, FALSE);

    if (choice != SELECT_CHARSET_CANCEL) {
        default_source_codepage = choice;
        return do_set_codepage (default_source_codepage);
    }

    return FALSE;
}
#endif /* HAVE_CHARSET */

View File

@ -11,6 +11,7 @@
#include "lib/global.h"
int select_charset (int center_y, int center_x, int current_charset, gboolean seldisplay);
gboolean do_set_codepage (int);
gboolean do_select_codepage (void);
/* some results of select_charset() */

View File

@ -390,10 +390,12 @@ save_setup (void)
#endif /* ENABLE_VFS && USE_NETCODE */
#ifdef HAVE_CHARSET
mc_config_set_string(mc_main_config, "Misc" , "display_codepage",
mc_config_set_string (mc_main_config, "Misc" , "display_codepage",
get_codepage_id( display_codepage ));
mc_config_set_string(mc_main_config, "Misc" , "source_codepage",
get_codepage_id( source_codepage ));
mc_config_set_string (mc_main_config, "Misc" , "source_codepage",
get_codepage_id( default_source_codepage ));
mc_config_set_string (mc_main_config, "Misc" , "autodetect_codeset",
autodetect_codeset );
#endif /* HAVE_CHARSET */
tmp_profile = g_build_filename (home_dir, MC_USERCONF_DIR, MC_CONFIG_FILE, NULL);
ret = mc_config_save_to_file (mc_main_config, tmp_profile, NULL);
@ -739,19 +741,19 @@ load_setup (void)
/* mc.lib is common for all users, but has priority lower than
~/.mc/ini. FIXME: it's only used for keys and treestore now */
global_profile_name = concat_dir_and_file (mc_home, MC_GLOBAL_CONFIG_FILE);
if (!exist_file(global_profile_name)) {
if (!exist_file (global_profile_name)) {
g_free (global_profile_name);
global_profile_name = concat_dir_and_file (mc_home_alt, MC_GLOBAL_CONFIG_FILE);
}
panels_profile_name = g_build_filename (home_dir, MC_USERCONF_DIR, MC_PANELS_FILE, NULL);
mc_main_config = mc_config_init(profile);
mc_main_config = mc_config_init (profile);
if (!exist_file(panels_profile_name))
setup__move_panels_config_into_separate_file(profile);
setup__move_panels_config_into_separate_file (profile);
mc_panels_config = mc_config_init(panels_profile_name);
mc_panels_config = mc_config_init (panels_profile_name);
/* Load integer boolean options */
for (i = 0; int_options[i].opt_name; i++)
@ -775,7 +777,7 @@ load_setup (void)
startup_left_mode = view_listing;
if (!other_dir){
buffer = mc_config_get_string(mc_panels_config, "Dirs", "other_dir", ".");
buffer = mc_config_get_string (mc_panels_config, "Dirs", "other_dir", ".");
if (vfs_file_is_local (buffer))
other_dir = buffer;
else
@ -783,16 +785,16 @@ load_setup (void)
}
boot_current_is_left =
mc_config_get_int(mc_panels_config, "Dirs", "current_is_left", 1);
mc_config_get_int (mc_panels_config, "Dirs", "current_is_left", 1);
#ifdef USE_NETCODE
ftpfs_proxy_host = mc_config_get_string(mc_main_config, "Misc", "ftp_proxy_host", "gate");
ftpfs_proxy_host = mc_config_get_string (mc_main_config, "Misc", "ftp_proxy_host", "gate");
#endif
/* The default color and the terminal dependent color */
setup_color_string = mc_config_get_string(mc_main_config, "Colors", "base_color", "");
term_color_string = mc_config_get_string(mc_main_config, "Colors", getenv ("TERM"), "");
color_terminal_string = mc_config_get_string(mc_main_config, "Colors", "color_terminals", "");
setup_color_string = mc_config_get_string (mc_main_config, "Colors", "base_color", "");
term_color_string = mc_config_get_string (mc_main_config, "Colors", getenv ("TERM"), "");
color_terminal_string = mc_config_get_string (mc_main_config, "Colors", "color_terminals", "");
/* Load the directory history */
/* directory_history_load (); */
@ -802,25 +804,29 @@ load_setup (void)
#endif /* ENABLE_VFS && USE_NETCODE */
#ifdef HAVE_CHARSET
if ( load_codepages_list() > 0 ) {
buffer = mc_config_get_string(mc_main_config, "Misc", "display_codepage", "");
if ( buffer[0] != '\0' )
{
display_codepage = get_codepage_index( buffer );
if (load_codepages_list () > 0) {
buffer = mc_config_get_string (mc_main_config, "Misc", "display_codepage", "");
if (buffer[0] != '\0') {
display_codepage = get_codepage_index (buffer);
cp_display = get_codepage_id (display_codepage);
}
g_free(buffer);
buffer = mc_config_get_string(mc_main_config, "Misc", "source_codepage", "");
if ( buffer[0] != '\0' )
{
source_codepage = get_codepage_index( buffer );
if (buffer[0] != '\0') {
default_source_codepage = get_codepage_index (buffer);
source_codepage = default_source_codepage; /* May be source_codepage don't needed this */
cp_source = get_codepage_id (source_codepage);
}
g_free(buffer);
}
init_translation_table( source_codepage, display_codepage );
if ( get_codepage_id( display_codepage ) )
utf8_display = str_isutf8 (get_codepage_id( display_codepage ));
autodetect_codeset = mc_config_get_string (mc_main_config, "Misc", "autodetect_codeset", "");
if ((autodetect_codeset[0] != '\0') && (strcmp(autodetect_codeset, "off")))
is_autodetect_codeset_enabled=TRUE;
init_translation_table (source_codepage, display_codepage);
if (get_codepage_id (display_codepage))
utf8_display = str_isutf8 (get_codepage_id (display_codepage));
#endif /* HAVE_CHARSET */
}