diff --git a/test/myhtml/commoncrawl.c b/test/myhtml/commoncrawl.c
index 847ec6f..bbba386 100644
--- a/test/myhtml/commoncrawl.c
+++ b/test/myhtml/commoncrawl.c
@@ -21,272 +21,272 @@
#include
#include
-#include
-#include
-#include
-
-#include
-
-#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
-
-static myhtml_tree_t* global_tree;
-
-#define total_count_size 20
-static size_t total_count[total_count_size];
-
-typedef void (*process_state_f)(const char* data, size_t filename_size);
-typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
-
-void print_total_count(void)
-{
- size_t total = 0;
- for(size_t i = 0; i < 7; i++)
- total += total_count[i];
-
- printf("Total: %zu\n" ,total);
-
- printf("\t0-100: %zu\n", total_count[0]);
- printf("\t100-1000: %zu\n", total_count[1]);
- printf("\t1000-5000: %zu\n", total_count[2]);
- printf("\t5000-10000: %zu\n", total_count[3]);
- printf("\t10000-50000: %zu\n", total_count[4]);
- printf("\t50000-100000: %zu\n", total_count[5]);
- printf("\t100000 and up: %zu\n", total_count[6]);
-}
-
-void listdir(const char *name, process_state_f callback)
-{
- memset(total_count, 0, sizeof(size_t) * total_count_size);
-
- DIR *dir;
- struct dirent *entry;
-
- if(!(dir = opendir(name)))
- return;
- if(!(entry = readdir(dir)))
- return;
-
- do {
- if(entry->d_type == DT_DIR) {
- char path[2048];
-
- int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
- path[len] = '\0';
-
- if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
- continue;
-
- listdir(path, callback);
- }
- else {
- char path[2048];
-
- int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
- path[len] = '\0';
-
- if(path[ (len - 3) ] == '.' && path[ (len - 2) ] == 'g' && path[ (len - 1) ] == 'z') {
- callback(path, len);
- }
- }
- }
- while ((entry = readdir(dir)));
-
- closedir(dir);
-}
-
-void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
-{
- memset(total_count, 0, sizeof(size_t) * total_count_size);
-
- FILE *fh = fopen(filename, "rb");
- if(fh == NULL) {
- fprintf(stderr, "Can't open html file: %s\n", filename);
- exit(EXIT_FAILURE);
- }
-
- fseek(fh, 0L, SEEK_END);
- long size = ftell(fh);
- fseek(fh, 0L, SEEK_SET);
-
- char *data = (char*)malloc(size + 1);
- if(data == NULL) {
- fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
- exit(EXIT_FAILURE);
- }
-
- size_t nread = fread(data, 1, size, fh);
- if (nread != size) {
- fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
- exit(EXIT_FAILURE);
- }
-
- fclose(fh);
-
- if(size < 0)
- size = 0;
-
- size_t from = 0;
- char path[2048];
-
- for(size_t i = 0; i < size; i++) {
- if(data[i] == '\n') {
- int len = snprintf(path, sizeof(path)-1, "%s/%.*s", db_dir, (int)(i - from), &data[from]);
- path[len] = '\0';
-
- callback(path, len);
-
- from = i + 1;
- }
- }
-
- free(data);
-}
-
-void process(const char* filename, size_t filename_size, parser_state_f parser)
-{
- FILE *fh = fopen(filename, "rb");
- if(fh == NULL) {
- fprintf(stderr, "Can't open html file: %s\n", filename);
- exit(EXIT_FAILURE);
- }
-
- fseek(fh, 0L, SEEK_SET);
-
- const char *ct = "Content-Length:";
- size_t ct_size = strlen(ct);
-
- char * line = NULL;
- long get_size = 0;
- ssize_t read = 0;
-
- size_t count = 0, read_len = 0;
-
- while ((read = getline(&line, &read_len, fh)) != -1) {
-
- if(strncmp(ct, line, ct_size) == 0) {
- size_t i;
-
- for(i = ct_size; i < read_len; i++)
- if(line[i] != '\n' && line[i] != '\r' && line[i] != ' ')
- break;
-
- get_size = strtol(&line[i], NULL, 0);
- }
- else if(get_size && line[0] == '\r' && line[1] == '\n') {
- long head_begin = ftell(fh) + 2;
- long end = head_begin + get_size;
-
- while ((read = getline(&line, &read_len, fh)) != -1) {
- //printf("%.*s", (int)read_len, line);
-
- if(line[0] == '\r' && line[1] == '\n')
- break;
- }
-
- long head_end = ftell(fh);
-
- size_t html_length = (end - head_end);
- char *html = malloc(html_length + 1);
-
- size_t nread = fread(html, 1, html_length, fh);
- if (nread != html_length) {
- fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", html_length, nread);
- exit(EXIT_FAILURE);
- }
-
- count++;
- parser(html, html_length, count);
-
- get_size = 0;
- free(html);
- }
- }
-
- fclose(fh);
-}
-
-void html_parser(const char* html, size_t html_length, size_t count)
-{
- if((count % 1000) == 0) {
- printf("\t%zu\n", count);
- }
-
- myencoding_t encoding = 0;
- //myhtml_encoding_detect(html, html_length, &encoding);
-
- // parse html
- myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
- if(status != MyHTML_STATUS_OK) {
- fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
- exit(EXIT_FAILURE);
- }
-
- if(html_length < 100)
- total_count[0]++;
- else if(html_length >= 100 && html_length < 1000)
- total_count[1]++;
- else if(html_length >= 1000 && html_length < 5000)
- total_count[2]++;
- else if(html_length >= 5000 && html_length < 10000)
- total_count[3]++;
- else if(html_length >= 10000 && html_length < 50000)
- total_count[4]++;
- else if(html_length >= 50000 && html_length < 100000)
- total_count[5]++;
- else if(html_length >= 100000)
- total_count[6]++;
-
- //myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
-}
-
-void process_unpack(const char* filename, size_t filename_size)
-{
- char command[2048];
- snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
-
- printf("Unzip %s\n", filename);
-
- system(command);
-
- char new_path[2048];
- size_t new_path_size = (filename_size - 3);
-
- snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
-
- printf("Process %s:\n", new_path);
- process(new_path, new_path_size, html_parser);
- printf("\n");
-
- unlink(new_path);
-}
-
-static void usage(void)
-{
- fprintf(stderr, "commoncrawl \n");
-}
-
+//#include
+//#include
+//#include
+//
+//#include
+//
+//#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+//
+//static myhtml_tree_t* global_tree;
+//
+//#define total_count_size 20
+//static size_t total_count[total_count_size];
+//
+//typedef void (*process_state_f)(const char* data, size_t filename_size);
+//typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
+//
+//void print_total_count(void)
+//{
+// size_t total = 0;
+// for(size_t i = 0; i < 7; i++)
+// total += total_count[i];
+//
+// printf("Total: %zu\n" ,total);
+//
+// printf("\t0-100: %zu\n", total_count[0]);
+// printf("\t100-1000: %zu\n", total_count[1]);
+// printf("\t1000-5000: %zu\n", total_count[2]);
+// printf("\t5000-10000: %zu\n", total_count[3]);
+// printf("\t10000-50000: %zu\n", total_count[4]);
+// printf("\t50000-100000: %zu\n", total_count[5]);
+// printf("\t100000 and up: %zu\n", total_count[6]);
+//}
+//
+//void listdir(const char *name, process_state_f callback)
+//{
+// memset(total_count, 0, sizeof(size_t) * total_count_size);
+//
+// DIR *dir;
+// struct dirent *entry;
+//
+// if(!(dir = opendir(name)))
+// return;
+// if(!(entry = readdir(dir)))
+// return;
+//
+// do {
+// if(entry->d_type == DT_DIR) {
+// char path[2048];
+//
+// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
+// path[len] = '\0';
+//
+// if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+// continue;
+//
+// listdir(path, callback);
+// }
+// else {
+// char path[2048];
+//
+// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
+// path[len] = '\0';
+//
+// if(path[ (len - 3) ] == '.' && path[ (len - 2) ] == 'g' && path[ (len - 1) ] == 'z') {
+// callback(path, len);
+// }
+// }
+// }
+// while ((entry = readdir(dir)));
+//
+// closedir(dir);
+//}
+//
+//void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
+//{
+// memset(total_count, 0, sizeof(size_t) * total_count_size);
+//
+// FILE *fh = fopen(filename, "rb");
+// if(fh == NULL) {
+// fprintf(stderr, "Can't open html file: %s\n", filename);
+// exit(EXIT_FAILURE);
+// }
+//
+// fseek(fh, 0L, SEEK_END);
+// long size = ftell(fh);
+// fseek(fh, 0L, SEEK_SET);
+//
+// char *data = (char*)malloc(size + 1);
+// if(data == NULL) {
+// fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
+// exit(EXIT_FAILURE);
+// }
+//
+// size_t nread = fread(data, 1, size, fh);
+// if (nread != size) {
+// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
+// exit(EXIT_FAILURE);
+// }
+//
+// fclose(fh);
+//
+// if(size < 0)
+// size = 0;
+//
+// size_t from = 0;
+// char path[2048];
+//
+// for(size_t i = 0; i < size; i++) {
+// if(data[i] == '\n') {
+// int len = snprintf(path, sizeof(path)-1, "%s/%.*s", db_dir, (int)(i - from), &data[from]);
+// path[len] = '\0';
+//
+// callback(path, len);
+//
+// from = i + 1;
+// }
+// }
+//
+// free(data);
+//}
+//
+//void process(const char* filename, size_t filename_size, parser_state_f parser)
+//{
+// FILE *fh = fopen(filename, "rb");
+// if(fh == NULL) {
+// fprintf(stderr, "Can't open html file: %s\n", filename);
+// exit(EXIT_FAILURE);
+// }
+//
+// fseek(fh, 0L, SEEK_SET);
+//
+// const char *ct = "Content-Length:";
+// size_t ct_size = strlen(ct);
+//
+// char * line = NULL;
+// long get_size = 0;
+// ssize_t read = 0;
+//
+// size_t count = 0, read_len = 0;
+//
+// while ((read = getline(&line, &read_len, fh)) != -1) {
+//
+// if(strncmp(ct, line, ct_size) == 0) {
+// size_t i;
+//
+// for(i = ct_size; i < read_len; i++)
+// if(line[i] != '\n' && line[i] != '\r' && line[i] != ' ')
+// break;
+//
+// get_size = strtol(&line[i], NULL, 0);
+// }
+// else if(get_size && line[0] == '\r' && line[1] == '\n') {
+// long head_begin = ftell(fh) + 2;
+// long end = head_begin + get_size;
+//
+// while ((read = getline(&line, &read_len, fh)) != -1) {
+// //printf("%.*s", (int)read_len, line);
+//
+// if(line[0] == '\r' && line[1] == '\n')
+// break;
+// }
+//
+// long head_end = ftell(fh);
+//
+// size_t html_length = (end - head_end);
+// char *html = malloc(html_length + 1);
+//
+// size_t nread = fread(html, 1, html_length, fh);
+// if (nread != html_length) {
+// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", html_length, nread);
+// exit(EXIT_FAILURE);
+// }
+//
+// count++;
+// parser(html, html_length, count);
+//
+// get_size = 0;
+// free(html);
+// }
+// }
+//
+// fclose(fh);
+//}
+//
+//void html_parser(const char* html, size_t html_length, size_t count)
+//{
+// if((count % 1000) == 0) {
+// printf("\t%zu\n", count);
+// }
+//
+// myencoding_t encoding = 0;
+// //myhtml_encoding_detect(html, html_length, &encoding);
+//
+// // parse html
+// myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
+// if(status != MyHTML_STATUS_OK) {
+// fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
+// exit(EXIT_FAILURE);
+// }
+//
+// if(html_length < 100)
+// total_count[0]++;
+// else if(html_length >= 100 && html_length < 1000)
+// total_count[1]++;
+// else if(html_length >= 1000 && html_length < 5000)
+// total_count[2]++;
+// else if(html_length >= 5000 && html_length < 10000)
+// total_count[3]++;
+// else if(html_length >= 10000 && html_length < 50000)
+// total_count[4]++;
+// else if(html_length >= 50000 && html_length < 100000)
+// total_count[5]++;
+// else if(html_length >= 100000)
+// total_count[6]++;
+//
+// //myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
+//}
+//
+//void process_unpack(const char* filename, size_t filename_size)
+//{
+// char command[2048];
+// snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
+//
+// printf("Unzip %s\n", filename);
+//
+// system(command);
+//
+// char new_path[2048];
+// size_t new_path_size = (filename_size - 3);
+//
+// snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
+//
+// printf("Process %s:\n", new_path);
+// process(new_path, new_path_size, html_parser);
+// printf("\n");
+//
+// unlink(new_path);
+//}
+//
+//static void usage(void)
+//{
+// fprintf(stderr, "commoncrawl \n");
+//}
+//
int main(int argc, const char * argv[])
{
- if (argc != 2) {
- usage();
- return 0;
- }
-
- // basic init
- myhtml_t* myhtml = myhtml_create();
- myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
-
- // first tree init
- global_tree = myhtml_tree_create();
- myhtml_tree_init(global_tree, myhtml);
-
- listdir(argv[1], process_unpack);
-
- // release resources
- myhtml_tree_destroy(global_tree);
- myhtml_destroy(myhtml);
-
- print_total_count();
+// if (argc != 2) {
+// usage();
+// return 0;
+// }
+//
+// // basic init
+// myhtml_t* myhtml = myhtml_create();
+// myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
+//
+// // first tree init
+// global_tree = myhtml_tree_create();
+// myhtml_tree_init(global_tree, myhtml);
+//
+// listdir(argv[1], process_unpack);
+//
+// // release resources
+// myhtml_tree_destroy(global_tree);
+// myhtml_destroy(myhtml);
+//
+// print_total_count();
return 0;
}