NetBSD/usr.bin/sort/fields.c

381 lines
11 KiB
C

/* $NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Ben Harris and Jaromir Dolecek.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Peter McIlroy.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* Subroutines to generate sort keys. */
#include "sort.h"
__RCSID("$NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $");
#define SKIP_BLANKS(ptr) { \
if (BLANK & d_mask[*(ptr)]) \
while (BLANK & d_mask[*(++(ptr))]); \
}
#define NEXTCOL(pos) { \
if (!SEP_FLAG) \
while (BLANK & l_d_mask[*(++pos)]); \
while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
}
static u_char *enterfield(u_char *, const u_char *, struct field *, int);
static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
static u_char *length(u_char *, const u_char *, u_char *, u_char *, int);
#define DECIMAL_POINT '.'
/*
* constructs sort key with leading recheader, followed by the key,
* followed by the original line.
*/
length_t
enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
size_t line_size, struct field fieldtable[])
/* keybuf: pointer to start of key */
{
int i;
u_char *l_d_mask;
u_char *lineend, *pos;
const u_char *endkey;
u_char *keypos;
struct coldesc *clpos;
int col = 1;
struct field *ftpos;
l_d_mask = d_mask;
pos = line_data - 1;
lineend = line_data + line_size-1;
/* don't include rec_delimiter */
for (i = 0; i < ncols; i++) {
clpos = clist + i;
for (; (col < clpos->num) && (pos < lineend); col++) {
NEXTCOL(pos);
}
if (pos >= lineend)
break;
clpos->start = SEP_FLAG ? pos + 1 : pos;
NEXTCOL(pos);
clpos->end = pos;
col++;
if (pos >= lineend) {
clpos->end = lineend;
i++;
break;
}
}
for (; i <= ncols; i++)
clist[i].start = clist[i].end = lineend;
if (clist[0].start < line_data)
clist[0].start++;
/*
* We write the sort keys (concatenated) followed by the
* original line data (for output) as the 'keybuf' data.
* keybuf->length is the number of key bytes + data bytes.
* keybuf->offset is the number of key bytes.
* We add a record separator weight after the key in case
* (as is usual) we need to preserve the order of equal lines,
* and for 'sort -u'.
* The key itself will have had the correct weight applied.
*/
keypos = keybuf->data;
endkey = keybuf_end - line_size - 1;
if (endkey <= keypos)
/* No room for any key bytes */
return 1;
for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
if ((keypos = enterfield(keypos, endkey, ftpos,
fieldtable->flags)) == NULL)
return (1);
}
keybuf->offset = keypos - keybuf->data;
keybuf->length = keybuf->offset + line_size;
/*
* Posix requires that equal keys be further sorted by the
* entire original record.
* NetBSD has (at least for some time) kept equal keys in
* their original order.
* For 'sort -u' posix_sort is unset.
*/
keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
memcpy(keypos, line_data, line_size);
return (0);
}
/*
* constructs a field (as defined by -k) within a key
*/
static u_char *
enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
int gflags)
{
u_char *start, *end, *lineend, *mask, *lweight;
struct column icol, tcol;
u_int flags;
icol = cur_fld->icol;
tcol = cur_fld->tcol;
flags = cur_fld->flags;
start = icol.p->start;
lineend = clist[ncols].end;
if (flags & BI)
SKIP_BLANKS(start);
start += icol.indent;
start = min(start, lineend);
if (!tcol.num)
end = lineend;
else {
if (tcol.indent) {
end = tcol.p->start;
if (flags & BT)
SKIP_BLANKS(end);
end += tcol.indent;
end = min(end, lineend);
} else
end = tcol.p->end;
}
if (flags & L)
return length(tablepos, endkey, start, end, flags);
if (flags & N)
return number(tablepos, endkey, start, end, flags);
/* Bound check space - assuming nothing is skipped */
if (tablepos + (end - start) + 1 >= endkey)
return NULL;
mask = cur_fld->mask;
lweight = cur_fld->weights;
for (; start < end; start++) {
if (!mask || mask[*start]) {
*tablepos++ = lweight[*start];
}
}
/* Add extra byte (absent from lweight) to sort short keys correctly */
*tablepos++ = lweight[REC_D];
return tablepos;
}
/*
* Numbers are converted to a floating point format (exponent & mantissa)
* so that they compare correctly as sequence of unsigned bytes.
* Bytes 0x00 and 0xff are used to terminate positive and negative numbers
* to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
*
* The first byte contain the overall sign, exponent sign and some of the
* exponent. These have to be ordered (-ve value, decreasing exponent),
* zero, (+ve value, increasing exponent).
*
* The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
* -ve values are the 1's compliments (so 0x7f isn't used!).
*
* This only leaves 63 byte values for +ve exponents - which isn't enough.
* The largest 4 exponent values are used to hold a byte count of the
* number of following bytes that contain 8 exponent bits per byte,
* This lets us sort exponents from -2^31 to +2^31.
*
* The mantissa is stored 2 digits per byte offset by 0x40, for negative
* numbers the order must be reversed (they are bit inverted).
*
* Reverse sorts are done by inverting the sign of the number.
*/
#define MAX_EXP_ENC ((int)sizeof(int))
static u_char *
number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
int reverse)
{
int exponent = -1;
int had_dp = 0;
u_char *tline;
char ch;
unsigned int val;
u_char *last_nz_pos;
u_char negate;
if (reverse & R)
negate = 0xff;
else
negate = 0;
/* Give ourselves space for the key terminator */
bufend--;
/* Ensure we have enough space for the exponent */
if (pos + 1 + MAX_EXP_ENC > bufend)
return (NULL);
SKIP_BLANKS(line);
if (*line == '-') { /* set the sign */
negate ^= 0xff;
line++;
} else if (*line == '+') {
line++;
}
/* eat initial zeroes */
for (; *line == '0' && line < lineend; line++)
continue;
/* calculate exponents */
if (*line == DECIMAL_POINT) {
/* Decimal fraction */
had_dp = 1;
while (*++line == '0' && line < lineend)
exponent--;
} else {
/* Large (absolute) value, count digits */
for (tline = line; *tline >= '0' &&
*tline <= '9' && tline < lineend; tline++)
exponent++;
}
/* If the first/next character isn't a digit, value is zero */
if (*line < '1' || *line > '9' || line >= lineend) {
/* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
/* XXX what about NaN, NAN, inf and INF */
*pos++ = 0x80;
return pos;
}
/* Maybe here we should allow for e+12 (etc) */
if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
/* Value ok for simple encoding */
/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
exponent += 0xc0;
*pos++ = negate ^ exponent;
} else {
/* Out or range for a single byte */
int c, t;
t = exponent > 0 ? exponent : -exponent;
/* Count how many 8-bit bytes are needed */
for (c = 0; ; c++) {
t >>= 8;
if (t == 0)
break;
}
/* 'c' better be 0..3 here - but probably 0..1 */
/* Offset just outside valid range */
t = c + 0x40 - MAX_EXP_ENC;
if (exponent < 0)
t = -t;
*pos++ = negate ^ (t + 0xc0);
/* now add each byte, most significant first */
for (; c >= 0; c--)
*pos++ = negate ^ (exponent >> (c * 8));
}
/* Finally add mantissa, 2 digits per byte */
for (last_nz_pos = pos; line < lineend; ) {
if (pos >= bufend)
return NULL;
ch = *line++;
val = (ch - '0') * 10;
if (val > 90) {
if (ch == DECIMAL_POINT && !had_dp) {
had_dp = 1;
continue;
}
break;
}
while (line < lineend) {
ch = *line++;
if (ch == DECIMAL_POINT && !had_dp) {
had_dp = 1;
continue;
}
if (ch < '0' || ch > '9')
line = lineend;
else
val += ch - '0';
break;
}
*pos++ = negate ^ (val + 0x40);
if (val != 0)
last_nz_pos = pos;
}
/* Add key terminator, deleting any trailing "00" */
*last_nz_pos++ = negate;
return (last_nz_pos);
}
static u_char *
length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
int flag)
{
u_char buf[32];
int l;
SKIP_BLANKS(line);
l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line);
return number(pos, bufend, buf, buf + l, flag);
}