diff --git a/doc/txtvsbin.txt b/doc/txtvsbin.txt index 3d0f063..2a901ea 100644 --- a/doc/txtvsbin.txt +++ b/doc/txtvsbin.txt @@ -38,15 +38,15 @@ The Algorithm The algorithm works by dividing the set of bytecodes [0..255] into three categories: -- The white list of textual bytecodes: +- The allow list of textual bytecodes: 9 (TAB), 10 (LF), 13 (CR), 32 (SPACE) to 255. - The gray list of tolerated bytecodes: 7 (BEL), 8 (BS), 11 (VT), 12 (FF), 26 (SUB), 27 (ESC). -- The black list of undesired, non-textual bytecodes: +- The block list of undesired, non-textual bytecodes: 0 (NUL) to 6, 14 to 31. -If a file contains at least one byte that belongs to the white list and -no byte that belongs to the black list, then the file is categorized as +If a file contains at least one byte that belongs to the allow list and +no byte that belongs to the block list, then the file is categorized as plain text; otherwise, it is categorized as binary. (The boundary case, when the file is empty, automatically falls into the latter category.) @@ -84,9 +84,9 @@ consistent results, regardless what alphabet encoding is being used. results on a text encoded, say, using ISO-8859-16 versus UTF-8.) There is an extra category of plain text files that are "polluted" with -one or more black-listed codes, either by mistake or by peculiar design +one or more block-listed codes, either by mistake or by peculiar design considerations. In such cases, a scheme that tolerates a small fraction -of black-listed codes would provide an increased recall (i.e. more true +of block-listed codes would provide an increased recall (i.e. more true positives). This, however, incurs a reduced precision overall, since false positives are more likely to appear in binary files that contain large chunks of textual data. Furthermore, "polluted" plain text should diff --git a/trees.c b/trees.c index decaeb7..6896067 100644 --- a/trees.c +++ b/trees.c @@ -1091,9 +1091,9 @@ local void compress_block(s, ltree, dtree) * Check if the data type is TEXT or BINARY, using the following algorithm: * - TEXT if the two conditions below are satisfied: * a) There are no non-portable control characters belonging to the - * "black list" (0..6, 14..25, 28..31). + * "block list" (0..6, 14..25, 28..31). * b) There is at least one printable character belonging to the - * "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255). + * "allow list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255). * - BINARY otherwise. * - The following partially-portable control characters form a * "gray list" that is ignored in this detection algorithm: @@ -1103,19 +1103,19 @@ local void compress_block(s, ltree, dtree) local int detect_data_type(s) deflate_state *s; { - /* black_mask is the bit mask of black-listed bytes + /* block_mask is the bit mask of block-listed bytes * set bits 0..6, 14..25, and 28..31 * 0xf3ffc07f = binary 11110011111111111100000001111111 */ - unsigned long black_mask = 0xf3ffc07fUL; + unsigned long block_mask = 0xf3ffc07fUL; int n; - /* Check for non-textual ("black-listed") bytes. */ - for (n = 0; n <= 31; n++, black_mask >>= 1) - if ((black_mask & 1) && (s->dyn_ltree[n].Freq != 0)) + /* Check for non-textual ("block-listed") bytes. */ + for (n = 0; n <= 31; n++, block_mask >>= 1) + if ((block_mask & 1) && (s->dyn_ltree[n].Freq != 0)) return Z_BINARY; - /* Check for textual ("white-listed") bytes. */ + /* Check for textual ("allow-listed") bytes. */ if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0 || s->dyn_ltree[13].Freq != 0) return Z_TEXT; @@ -1123,7 +1123,7 @@ local int detect_data_type(s) if (s->dyn_ltree[n].Freq != 0) return Z_TEXT; - /* There are no "black-listed" or "white-listed" bytes: + /* There are no "block-listed" or "allow-listed" bytes: * this stream either is empty or has tolerated ("gray-listed") bytes only. */ return Z_BINARY;