Replace black/white with allow/block. (theresa-m)

This commit is contained in:
Mark Adler 2021-12-31 16:57:07 -08:00
parent c3f3043f7a
commit 8678871f18
2 changed files with 15 additions and 15 deletions

View File

@ -38,15 +38,15 @@ The Algorithm
The algorithm works by dividing the set of bytecodes [0..255] into three The algorithm works by dividing the set of bytecodes [0..255] into three
categories: categories:
- The white list of textual bytecodes: - The allow list of textual bytecodes:
9 (TAB), 10 (LF), 13 (CR), 32 (SPACE) to 255. 9 (TAB), 10 (LF), 13 (CR), 32 (SPACE) to 255.
- The gray list of tolerated bytecodes: - The gray list of tolerated bytecodes:
7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB), 27 (ESC). 7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB), 27 (ESC).
- The black list of undesired, non-textual bytecodes: - The block list of undesired, non-textual bytecodes:
0 (NUL) to 6, 14 to 25, 28 to 31. 0 (NUL) to 6, 14 to 25, 28 to 31.
If a file contains at least one byte that belongs to the white list and If a file contains at least one byte that belongs to the allow list and
no byte that belongs to the black list, then the file is categorized as no byte that belongs to the block list, then the file is categorized as
plain text; otherwise, it is categorized as binary. (The boundary case, plain text; otherwise, it is categorized as binary. (The boundary case,
when the file is empty, automatically falls into the latter category.) when the file is empty, automatically falls into the latter category.)
@ -84,9 +84,9 @@ consistent results, regardless of what alphabet encoding is being used.
results on a text encoded, say, using ISO-8859-16 versus UTF-8.) results on a text encoded, say, using ISO-8859-16 versus UTF-8.)
There is an extra category of plain text files that are "polluted" with There is an extra category of plain text files that are "polluted" with
one or more black-listed codes, either by mistake or by peculiar design one or more block-listed codes, either by mistake or by peculiar design
considerations. In such cases, a scheme that tolerates a small fraction considerations. In such cases, a scheme that tolerates a small fraction
of black-listed codes would provide an increased recall (i.e. more true of block-listed codes would provide an increased recall (i.e. more true
positives). This, however, incurs a reduced precision overall, since positives). This, however, incurs a reduced precision overall, since
false positives are more likely to appear in binary files that contain false positives are more likely to appear in binary files that contain
large chunks of textual data. Furthermore, "polluted" plain text should large chunks of textual data. Furthermore, "polluted" plain text should

18
trees.c
View File

@ -1091,9 +1091,9 @@ local void compress_block(s, ltree, dtree)
* Check if the data type is TEXT or BINARY, using the following algorithm: * Check if the data type is TEXT or BINARY, using the following algorithm:
* - TEXT if the two conditions below are satisfied: * - TEXT if the two conditions below are satisfied:
* a) There are no non-portable control characters belonging to the * a) There are no non-portable control characters belonging to the
* "black list" (0..6, 14..25, 28..31). * "block list" (0..6, 14..25, 28..31).
* b) There is at least one printable character belonging to the * b) There is at least one printable character belonging to the
* "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255). * "allow list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
* - BINARY otherwise. * - BINARY otherwise.
* - The following partially-portable control characters form a * - The following partially-portable control characters form a
* "gray list" that is ignored in this detection algorithm: * "gray list" that is ignored in this detection algorithm:
@ -1103,19 +1103,19 @@ local void compress_block(s, ltree, dtree)
local int detect_data_type(s) local int detect_data_type(s)
deflate_state *s; deflate_state *s;
{ {
/* black_mask is the bit mask of black-listed bytes /* block_mask is the bit mask of block-listed bytes
* set bits 0..6, 14..25, and 28..31 * set bits 0..6, 14..25, and 28..31
* 0xf3ffc07f = binary 11110011111111111100000001111111 * 0xf3ffc07f = binary 11110011111111111100000001111111
*/ */
unsigned long black_mask = 0xf3ffc07fUL; unsigned long block_mask = 0xf3ffc07fUL;
int n; int n;
/* Check for non-textual ("black-listed") bytes. */ /* Check for non-textual ("block-listed") bytes. */
for (n = 0; n <= 31; n++, black_mask >>= 1) for (n = 0; n <= 31; n++, block_mask >>= 1)
if ((black_mask & 1) && (s->dyn_ltree[n].Freq != 0)) if ((block_mask & 1) && (s->dyn_ltree[n].Freq != 0))
return Z_BINARY; return Z_BINARY;
/* Check for textual ("white-listed") bytes. */ /* Check for textual ("allow-listed") bytes. */
if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0 if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
|| s->dyn_ltree[13].Freq != 0) || s->dyn_ltree[13].Freq != 0)
return Z_TEXT; return Z_TEXT;
@ -1123,7 +1123,7 @@ local int detect_data_type(s)
if (s->dyn_ltree[n].Freq != 0) if (s->dyn_ltree[n].Freq != 0)
return Z_TEXT; return Z_TEXT;
/* There are no "black-listed" or "white-listed" bytes: /* There are no "block-listed" or "allow-listed" bytes:
* this stream either is empty or has tolerated ("gray-listed") bytes only. * this stream either is empty or has tolerated ("gray-listed") bytes only.
*/ */
return Z_BINARY; return Z_BINARY;