mc/lib/search/hex.c

/*
   Search text engine.
   HEX-style pattern matching

   Copyright (C) 2009-2016
   Free Software Foundation, Inc.

   Written by:
   Slava Zanko <slavazanko@gmail.com>, 2009.

   This file is part of the Midnight Commander.

   The Midnight Commander is free software: you can redistribute it
   and/or modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   The Midnight Commander is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include <stdio.h>

#include "lib/global.h"
#include "lib/strutil.h"
#include "lib/search.h"
#include "lib/strescape.h"

#include "internal.h"

/*** global variables ****************************************************************************/

/*** file scope macro definitions ****************************************************************/

typedef enum
{
    MC_SEARCH_HEX_E_OK,
    MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE,
    MC_SEARCH_HEX_E_INVALID_CHARACTER,
    MC_SEARCH_HEX_E_UNMATCHED_QUOTES
} mc_search_hex_parse_error_t;

/*** file scope type declarations ****************************************************************/

/*** file scope variables ************************************************************************/

/*** file scope functions ************************************************************************/

static GString *
mc_search__hex_translate_to_regex (const GString * astr, mc_search_hex_parse_error_t * error_ptr,
                                   int *error_pos_ptr)
{
    GString *buff;
    const char *str;
    gsize str_len;
    gsize loop = 0;
    mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;

    buff = g_string_sized_new (64);
    str = astr->str;
    str_len = astr->len;

    while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
    {
        unsigned int val;
        int ptr;

        if (g_ascii_isspace (str[loop]))
        {
            /* Eat-up whitespace between tokens. */
            while (g_ascii_isspace (str[loop]))
                loop++;
        }
        /* cppcheck-suppress invalidscanf */
        else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
        {
            if (val > 255)
                error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
            else
            {
                g_string_append_printf (buff, "\\x%02X", val);
                loop += ptr;
            }
        }
        else if (str[loop] == '"')
        {
            gsize loop2;

            loop2 = loop + 1;

            while (loop2 < str_len)
            {
                if (str[loop2] == '"')
                    break;
                if (str[loop2] == '\\' && loop2 + 1 < str_len)
                    loop2++;
                g_string_append_c (buff, str[loop2]);
                loop2++;
            }

            if (str[loop2] == '\0')
                error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
            else
                loop = loop2 + 1;
        }
        else
            error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
    }

    if (error != MC_SEARCH_HEX_E_OK)
    {
        g_string_free (buff, TRUE);
        if (error_ptr != NULL)
            *error_ptr = error;
        if (error_pos_ptr != NULL)
            *error_pos_ptr = loop;
        return NULL;
    }

    return buff;
}

/*** public functions ****************************************************************************/

void
mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t * lc_mc_search,
                                     mc_search_cond_t * mc_search_cond)
{
    GString *tmp;
    mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
    int error_pos = 0;

    /*
     * We may be searching in binary data, which is often invalid UTF-8.
     *
     * We have to create a non UTF-8 regex (that is, G_REGEX_RAW) or else, as
     * the data is invalid UTF-8, both GLib's PCRE and our
     * mc_search__g_regex_match_full_safe() are going to fail us. The former by
     * not finding all bytes, the latter by overwriting the supposedly invalid
     * UTF-8 with NULs.
     *
     * To do this, we specify "ASCII" as the charset.
     *
     * In fact, we can specify any charset other than "UTF-8": any such charset
     * will trigger G_REGEX_RAW (see [1]). The output of [2] will be the same
     * for all charsets because it skips the \xXX symbols
     * mc_search__hex_translate_to_regex() outputs.
     *
     * But "ASCII" is the best choice because a hex pattern may contain a
     * quoted string: this way we know [2] will ignore any characters outside
     * ASCII letters range (these ignored chars will be copied verbatim to the
     * output and will match as-is; in other words, in a case-sensitive manner;
     * If the user is interested in case-insensitive searches of international
     * text, he shouldn't be using hex search in the first place.)
     *
     * Switching out of UTF-8 has another advantage:
     *
     * When doing case-insensitive searches, GLib treats \xXX symbols as normal
     * letters and therefore matches both "a" and "A" for the hex pattern
     * "0x61". When we switch out of UTF-8, we're switching to using [2], which
     * doesn't have this issue.
     *
     * [1] mc_search__cond_struct_new_init_regex
     * [2] mc_search__cond_struct_new_regex_ci_str
     */
    if (str_isutf8 (charset))
        charset = "ASCII";

    tmp = mc_search__hex_translate_to_regex (mc_search_cond->str, &error, &error_pos);
    if (tmp != NULL)
    {
        g_string_free (mc_search_cond->str, TRUE);
        mc_search_cond->str = tmp;
        mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
    }
    else
    {
        const char *desc;

        switch (error)
        {
        case MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE:
            desc =
                _
                ("Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
            break;
        case MC_SEARCH_HEX_E_INVALID_CHARACTER:
            desc = _("Invalid character");
            break;
        case MC_SEARCH_HEX_E_UNMATCHED_QUOTES:
            desc = _("Unmatched quotes character");
            break;
        default:
            desc = "";
        }

        lc_mc_search->error = MC_SEARCH_E_INPUT;
        lc_mc_search->error_str =
            g_strdup_printf (_("Hex pattern error at position %d:\n%s."), error_pos + 1, desc);
    }
}

/* --------------------------------------------------------------------------------------------- */

gboolean
mc_search__run_hex (mc_search_t * lc_mc_search, const void *user_data,
                    gsize start_search, gsize end_search, gsize * found_len)
{
    return mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);
}

/* --------------------------------------------------------------------------------------------- */

GString *
mc_search_hex_prepare_replace_str (mc_search_t * lc_mc_search, GString * replace_str)
{
    (void) lc_mc_search;
    return g_string_new_len (replace_str->str, replace_str->len);
}

/* --------------------------------------------------------------------------------------------- */