mc/lib/search/hex.c

235 lines
7.8 KiB
C
Raw Normal View History

/*
Search text engine.
HEX-style pattern matching
Copyright (C) 2009-2024
Free Software Foundation, Inc.
Written by:
Slava Zanko <slavazanko@gmail.com>, 2009.
This file is part of the Midnight Commander.
The Midnight Commander is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
The Midnight Commander is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <stdio.h>
#include "lib/global.h"
#include "lib/strutil.h"
#include "lib/search.h"
#include "internal.h"
/*** global variables ****************************************************************************/
/*** file scope macro definitions ****************************************************************/
typedef enum
{
MC_SEARCH_HEX_E_OK,
MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE,
MC_SEARCH_HEX_E_INVALID_CHARACTER,
MC_SEARCH_HEX_E_UNMATCHED_QUOTES
} mc_search_hex_parse_error_t;
/*** file scope type declarations ****************************************************************/
/*** forward declarations (file scope functions) *************************************************/
/*** file scope variables ************************************************************************/
/* --------------------------------------------------------------------------------------------- */
/*** file scope functions ************************************************************************/
/* --------------------------------------------------------------------------------------------- */
static GString *
mc_search__hex_translate_to_regex (const GString *astr, mc_search_hex_parse_error_t *error_ptr,
int *error_pos_ptr)
{
GString *buff;
const char *str;
gsize str_len;
gsize loop = 0;
mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
buff = g_string_sized_new (64);
str = astr->str;
str_len = astr->len;
while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
{
unsigned int val;
int ptr;
if (g_ascii_isspace (str[loop]))
{
/* Eat-up whitespace between tokens. */
while (g_ascii_isspace (str[loop]))
loop++;
}
/* cppcheck-suppress invalidscanf */
else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
{
if (val > 255)
error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
else
{
g_string_append_printf (buff, "\\x%02X", val);
loop += ptr;
}
}
else if (str[loop] == '"')
{
gsize loop2;
loop2 = loop + 1;
while (loop2 < str_len)
{
if (str[loop2] == '"')
break;
if (str[loop2] == '\\' && loop2 + 1 < str_len)
loop2++;
g_string_append_c (buff, str[loop2]);
loop2++;
}
if (str[loop2] == '\0')
error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
else
loop = loop2 + 1;
}
else
error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
}
if (error != MC_SEARCH_HEX_E_OK)
{
g_string_free (buff, TRUE);
if (error_ptr != NULL)
*error_ptr = error;
if (error_pos_ptr != NULL)
*error_pos_ptr = loop;
return NULL;
}
return buff;
}
/* --------------------------------------------------------------------------------------------- */
/*** public functions ****************************************************************************/
/* --------------------------------------------------------------------------------------------- */
void
mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t *lc_mc_search,
mc_search_cond_t *mc_search_cond)
{
GString *tmp;
mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
int error_pos = 0;
/*
* We may be searching in binary data, which is often invalid UTF-8.
*
* We have to create a non UTF-8 regex (that is, G_REGEX_RAW) or else, as
* the data is invalid UTF-8, both GLib's PCRE and our
* mc_search__g_regex_match_full_safe() are going to fail us. The former by
* not finding all bytes, the latter by overwriting the supposedly invalid
* UTF-8 with NULs.
*
* To do this, we specify "ASCII" as the charset.
*
* In fact, we can specify any charset other than "UTF-8": any such charset
* will trigger G_REGEX_RAW (see [1]). The output of [2] will be the same
* for all charsets because it skips the \xXX symbols
* mc_search__hex_translate_to_regex() outputs.
*
* But "ASCII" is the best choice because a hex pattern may contain a
* quoted string: this way we know [2] will ignore any characters outside
* ASCII letters range (these ignored chars will be copied verbatim to the
* output and will match as-is; in other words, in a case-sensitive manner;
* If the user is interested in case-insensitive searches of international
* text, he shouldn't be using hex search in the first place.)
*
* Switching out of UTF-8 has another advantage:
*
* When doing case-insensitive searches, GLib treats \xXX symbols as normal
* letters and therefore matches both "a" and "A" for the hex pattern
* "0x61". When we switch out of UTF-8, we're switching to using [2], which
* doesn't have this issue.
*
* [1] mc_search__cond_struct_new_init_regex
* [2] mc_search__cond_struct_new_regex_ci_str
*/
if (str_isutf8 (charset))
charset = "ASCII";
tmp = mc_search__hex_translate_to_regex (mc_search_cond->str, &error, &error_pos);
if (tmp != NULL)
{
g_string_free (mc_search_cond->str, TRUE);
mc_search_cond->str = tmp;
mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
}
else
{
const char *desc;
switch (error)
{
case MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE:
desc =
_
("Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
break;
case MC_SEARCH_HEX_E_INVALID_CHARACTER:
desc = _("Invalid character");
break;
case MC_SEARCH_HEX_E_UNMATCHED_QUOTES:
desc = _("Unmatched quotes character");
break;
default:
desc = "";
}
lc_mc_search->error = MC_SEARCH_E_INPUT;
lc_mc_search->error_str =
g_strdup_printf (_("Hex pattern error at position %d:\n%s."), error_pos + 1, desc);
}
}
/* --------------------------------------------------------------------------------------------- */
gboolean
mc_search__run_hex (mc_search_t *lc_mc_search, const void *user_data,
gsize start_search, gsize end_search, gsize *found_len)
{
return mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);
}
/* --------------------------------------------------------------------------------------------- */
GString *
mc_search_hex_prepare_replace_str (mc_search_t *lc_mc_search, GString *replace_str)
{
(void) lc_mc_search;
return mc_g_string_dup (replace_str);
}
/* --------------------------------------------------------------------------------------------- */