Added new rule for make (make modules)

This commit is contained in:
lexborisov 2017-03-20 21:57:29 +03:00
parent 3b3625cb1b
commit 2960726738
16 changed files with 401 additions and 70 deletions

View File

@ -1,51 +1,47 @@
# Modest: Build and Installation
## make
## GNU Make
In root directory:
In root directory of project (`/`):
```bash
make
```
If successful copy lib/* and include/* at the right place for you
Flags that can be passed to make:
- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2
- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO
*for example*
```bash
make MODEST_BUILD_WITHOUT_THREADS=NO
```
and copy to the right place for you
```bash
cp lib/* /usr/local/lib
cp -r include/* /usr/local/include
```
## cmake
In `project` directory:
```bash
cmake .
make
make test
sudo make install
```
Flags that can be passed to CMake:
- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2
- `CMAKE_INSTALL_LIBDIR=lib` set path to install created library. Default: lib
- `MODEST_BUILD_SHARED=ON` build shared library. Default: ON
- `MODEST_BUILD_STATIC=ON` build static library. Default: ON
- `MODEST_INSTALL_HEADER=OFF` install header files. Default ON
- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO
- `MODEST_EXTERN_MALLOC=my_malloc_func` set extern malloc function. Default: UNDEFINED
- `MODEST_EXTERN_REALLOC=my_realloc_func` set extern realloc function. Default: UNDEFINED
- `MODEST_EXTERN_CALLOC=my_calloc_func` set extern calloc function. Default: UNDEFINED
- `MODEST_EXTERN_FREE=my_free_func` set extern free function. Default: UNDEFINED
Flags that can be passed to make:
- `prefix`, default /usr/local
- `OS`, if not defined try to get from "uname -s"
- `PROJECT_OPTIMIZATION_LEVEL`, default -O2
- `MyCORE_BUILD_WITHOUT_THREADS`, YES or (NO or undefined), default undefined
- `MyCORE_BUILD_DEBUG`, YES or (NO or undefined), default undefined
- `MyCORE_WITH_PERF`, YES or (NO or undefined), default undefined; tries to build with performance timers (rdtsc or similar). OS dependent, may not work on some systems.
- `PROJECT_INSTALL_HEADER`, default "include"
- `PROJECT_INSTALL_LIBRARY`, default "lib"
- `PROJECT_INSTALL_WITHOUT_HEADERS`, YES or (NO or undefined), default undefined
*for example*
*for example*:
```bash
cmake . -DCMAKE_INSTALL_LIBDIR=lib64 -DMODEST_INSTALL_HEADER=ON
make -j4 prefix=/usr MyCORE_BUILD_WITHOUT_THREADS=YES
sudo make install
```
Makefile rules:
- `all` -- build all components (libraries, examples, tests) (default)
- `library` -- build only static and shared library
- `shared` -- build only shared library
- `static` -- build only static library
- `clean` -- clean up current build directory
- `clone` -- copy all headers from source to include directories and modify local include (`#include "..."`) to global (`#include <...>`)
- `clean_api` -- remove all headers from include directory
- `create` -- create directories for binary, libraries, tests
- `test` -- run all tests
- `modules` -- print each module's name, description and dependencies
- `install` -- install libraries and headers on your system
- `uninstall` -- delete libraries and headers on your system
- `make-pc-file` -- create pkg-config file
*for example*:
```bash
make shared
```

View File

@ -15,18 +15,19 @@ CC ?= gcc
# install -- install libraries and headers on your system
# uninstall -- delete libraries and headers on your system
# test -- run all tests
# modules -- print modules list: Module name, Description, Dependencies
# make-pc-file -- create pkg-config file
#
# ARGS
# prefix, default /usr/local
# OS, if not defined try to get from "uname -s"
# MODEST_OPTIMIZATION_LEVEL, default -O2
# PROJECT_OPTIMIZATION_LEVEL, default -O2
# MyCORE_BUILD_WITHOUT_THREADS, YES or (NO or undefined), default undefined
# MyCORE_BUILD_DEBUG, YES or (NO or undefined), default undefined
# MyCORE_WITH_PERF, YES or (NO or undefined), default undefined, try build with timers (rdtsc or some), OS dependent, may not work on some systems,
# MODEST_INSTALL_HEADER, default "include"
# MODEST_INSTALL_LIBRARY, default "lib"
# MODEST_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined
# PROJECT_INSTALL_HEADER, default "include"
# PROJECT_INSTALL_LIBRARY, default "lib"
# PROJECT_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined
#
# If OS build rules do not exist, we try to build the library with POSIX threads
@ -76,6 +77,12 @@ MyPORT_SELECTED_PORT = myport/$(strip $(MODEST_PORT_NAME))
#***************
include $(MODEST_BUILD_MODULES_MAKEFILES_LIST)
#********************
# Modules info
#***************
MODEST_BUILD_MODULES_INFO_DEP = $(foreach dep,$(strip $($1_dependencies)), $(dep))
MODEST_BUILD_MODULES_INFO := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),$(MODEST_UTILS_NEW_LINE)Module: $(name)$(MODEST_UTILS_NEW_LINE)Description: $($(name)_description)$(MODEST_UTILS_NEW_LINE)Dependencies:$(call MODEST_BUILD_MODULES_INFO_DEP,$(name))$(MODEST_UTILS_NEW_LINE))
#********************
# Set ARGS for flags
#***************
@ -102,17 +109,17 @@ BUILD_SUB_DIRS := examples $(TEST_DIR)
#********************
# Install
#***************
MODEST_INSTALL_LIBRARY := lib
MODEST_INSTALL_HEADER := include
PROJECT_INSTALL_LIBRARY := lib
PROJECT_INSTALL_HEADER := include
libdir ?= $(prefix)/$(MODEST_INSTALL_LIBRARY)
includedir ?= $(prefix)/$(MODEST_INSTALL_HEADER)
libdir ?= $(prefix)/$(PROJECT_INSTALL_LIBRARY)
includedir ?= $(prefix)/$(PROJECT_INSTALL_HEADER)
MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(MODEST_INSTALL_LIBRARY)
MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(PROJECT_INSTALL_LIBRARY)
MODEST_INSTALL_COMMAND := $(MODEST_INSTALL_CREATE_DIR) $(MODEST_UTILS_NEW_LINE) cp -av $(LIB_DIR_BASE)/* $(libdir)
ifneq ($(MODEST_INSTALL_WITHOUT_HEADERS),YES)
MODEST_INSTALL_CREATE_DIR += $(prefix)/$(MODEST_INSTALL_HEADER)
ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES)
MODEST_INSTALL_CREATE_DIR += $(prefix)/$(PROJECT_INSTALL_HEADER)
MODEST_INSTALL_COMMAND += $(MODEST_UTILS_NEW_LINE) cp -av $(INCLUDE_DIR_API)/* $(includedir)
endif
@ -122,7 +129,7 @@ endif
MODEST_UNINSTALL_MK_COMMAND :=
MODEST_UNINSTALL_FILE := uninstal.mk
ifneq ($(MODEST_INSTALL_WITHOUT_HEADERS),YES)
ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES)
MODEST_UNINSTALL_HEADERS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),rm -rf $(includedir)/$(name) \$$(MODEST_UTILS_NEW_LINE))
endif
@ -138,11 +145,11 @@ MODEST_PKG_CONFIG_FILE := modest.pc
MODEST_PKG_CONFIG_CFLAGS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),-I$\{includedir}/$(name))
MODEST_PKG_CONFIG_PROCESS = \
$(SED) \
-e 's,@version\@,$(MODEST_VERSION_STRING),g' \
-e 's,@version\@,$(PROJECT_VERSION_STRING),g' \
-e 's,@prefix\@,$(prefix),g' \
-e 's,@exec_prefix\@,$(exec_prefix),g' \
-e 's,@libdir\@,$(MODEST_INSTALL_LIBRARY),g' \
-e 's,@includedir\@,$(MODEST_INSTALL_HEADER),g' \
-e 's,@libdir\@,$(PROJECT_INSTALL_LIBRARY),g' \
-e 's,@includedir\@,$(PROJECT_INSTALL_HEADER),g' \
-e 's,@cflags\@,$(MODEST_PKG_CONFIG_CFLAGS),g' \
-e 's,@libname\@,$(LIB_NAME),g' \
-e 's,@description\@,$(DESCRIPTION),g' \
@ -193,4 +200,7 @@ test: library
make-pc-file:
$(call MODEST_PKG_CONFIG_PROCESS,$(MODEST_PKG_CONFIG_FILE).in, $(MODEST_PKG_CONFIG_FILE))
modules:
$(info $(MODEST_BUILD_MODULES_INFO))
.PHONY: all clean clone test $(MODEST_BUILD_MODULES_TARGET_ALL)

View File

@ -3,11 +3,11 @@ MODEST_BUILD_OS := UNDEF
#********************
# Version
#***************
MODEST_VERSION_MAJOR := 0
MODEST_VERSION_MINOR := 0
MODEST_VERSION_PATCH := 6
PROJECT_VERSION_MAJOR := 0
PROJECT_VERSION_MINOR := 0
PROJECT_VERSION_PATCH := 6
MODEST_VERSION_STRING := $(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR).$(MODEST_VERSION_PATCH)
PROJECT_VERSION_STRING := $(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR).$(PROJECT_VERSION_PATCH)
#********************
# Flags
@ -37,14 +37,14 @@ LIB_DIR_BASE := lib
# for use actual variables like a LIB_NAME_SUFFIX
MODEST_LIBRARY_NAME ?= lib$(LIB_NAME)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_NAME_STATIC ?=lib$(LIB_NAME)$(LIB_NAME_SUFFIX_STATIC)
MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(MODEST_VERSION_STRING)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(PROJECT_VERSION_STRING)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME)
MODEST_LIBRARY_STATIC ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_STATIC)
MODEST_LIBRARY_WITH_VERSION = $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_WITH_VERSION)
MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR)$(LIB_NAME_SUFFIX)
#********************
# Binaries
@ -75,7 +75,7 @@ SED ?= sed
MODEST_DIR_SEPARATOR ?= /
# flags
MODEST_OPTIMIZATION_LEVEL ?= -O2
PROJECT_OPTIMIZATION_LEVEL ?= -O2
MODEST_CFLAGS += -I$(INCLUDE_DIR)
MODEST_LDFLAGS +=
@ -89,7 +89,7 @@ MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed
# Set -D
#***************
ifeq ($(MyCORE_BUILD_DEBUG),YES)
override MODEST_OPTIMIZATION_LEVEL :=
override PROJECT_OPTIMIZATION_LEVEL :=
MODEST_CFLAGS += -g3 -ggdb3 -O0 -fno-omit-frame-pointer -DMyCORE_BUILD_DEBUG
endif
@ -128,7 +128,7 @@ endif # def MODEST_PORT_NAME
ifeq ($(MODEST_BUILD_OS),UNDEF)
MODEST_CFLAGS += -fPIC
MODEST_CFLAGS += -D_POSIX_C_SOURCE=199309L
MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE)
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY_WITH_VERSION_MAJOR) $(MODEST_UTILS_NEW_LINE)

View File

@ -1,6 +1,9 @@
modest_dirs := . finder style node layer render utils
modest_objs := $(call MODEST_UTILS_OBJS,modest,$(modest_dirs))
modest_description := calculating, compare, renderer
modest_dependencies := mycore mycss myencoding myfont myhtml myport myunicode myurl
modest_all: $(modest_objs)
modest_clean:

View File

@ -1,6 +1,9 @@
mycore_dirs := . utils
mycore_objs := $(call MODEST_UTILS_OBJS,mycore,$(mycore_dirs))
mycore_description := base module, it is used by all other modules
mycore_dependencies :=
mycore_all: $(mycore_objs)
mycore_clean:

View File

@ -1,6 +1,9 @@
mycss_dirs := . selectors namespace media values property declaration
mycss_objs := $(call MODEST_UTILS_OBJS,mycss,$(mycss_dirs))
mycss_description := CSS parser and modules by https://drafts.csswg.org/
mycss_dependencies := mycore myencoding myport
mycss_all: $(mycss_objs)
mycss_clean:

View File

@ -1,6 +1,9 @@
myencoding_dirs := .
myencoding_objs := $(call MODEST_UTILS_OBJS,myencoding,$(myencoding_dirs))
myencoding_description := work with character encodings, detecting encoding, convert encodings by https://encoding.spec.whatwg.org/
myencoding_dependencies := mycore myport
myencoding_all: $(myencoding_objs)
myencoding_clean:

View File

@ -1,6 +1,9 @@
myfont_dirs := .
myfont_objs := $(call MODEST_UTILS_OBJS,myfont,$(myfont_dirs))
myfont_description := work with font, metrics, calculating size and more by https://www.microsoft.com/en-us/Typography/SpecificationsOverview.aspx
myfont_dependencies := mycore myport
myfont_all: $(myfont_objs)
myfont_clean:

View File

@ -1,6 +1,9 @@
myhtml_dirs := .
myhtml_objs := $(call MODEST_UTILS_OBJS,myhtml,$(myhtml_dirs))
myhtml_description := HTML parser by https://html.spec.whatwg.org/multipage/
myhtml_dependencies := mycore myencoding myport
myhtml_all: $(myhtml_objs)
myhtml_clean:

View File

@ -8,7 +8,7 @@ ifeq ($(OS),Darwin)
MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed -i '.bak' -E 's/^[ \t]*\#[ \t]*include[ \t]*"([^"]+)"/\#include <\1>/g' {} \;
MODEST_CFLAGS += -fPIC
MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
LIB_NAME_SUFFIX := .dylib
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE)

View File

@ -5,7 +5,7 @@ ifeq ($(OS),Windows_NT)
LIB_NAME_SUFFIX := .dll
LIB_NAME_SUFFIX_STATIC := .dll.a
MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
MODEST_CFLAGS += -Wno-unused-variable -Wno-unused-function -std=c99
MODEST_LDFLAGS += -Wl,--out-implib,$(call MODEST_LIBRARY_STATIC)

View File

@ -1,6 +1,9 @@
myunicode_dirs := .
myunicode_objs := $(call MODEST_UTILS_OBJS,myunicode,$(myunicode_dirs))
myunicode_description := unicode normalization, case work and other
myunicode_dependencies := mycore myport
myunicode_all: $(myunicode_objs)
myunicode_clean:

View File

@ -1,6 +1,9 @@
myurl_dirs := .
myurl_objs := $(call MODEST_UTILS_OBJS,myurl,$(myurl_dirs))
myurl_description := URL parser by https://url.spec.whatwg.org/
myurl_dependencies := mycore myport
myurl_all: $(myurl_objs)
myurl_clean:

View File

@ -53,9 +53,9 @@ include $(BINARY_BUILD_MODULES_MAKEFILES_LIST)
#********************
# Set ARGS for flags
#***************
CFLAGS += $(BINARY_CFLAGS)
LDFLAGS += $(BINARY_LDFLAGS)
LDLIBS += $(BINARY_LIBRARIES)
override CFLAGS += $(BINARY_CFLAGS)
override LDFLAGS += $(BINARY_LDFLAGS)
override LDLIBS += $(BINARY_LIBRARIES)
#********************
# Objects

7
test/myhtml/Makefile.mk Normal file
View File

@ -0,0 +1,7 @@
myhtml_dirs := .
myhtml_objs := $(call BINARY_UTILS_OBJS,myhtml,$(myhtml_dirs))
myhtml_all: $(myhtml_objs)
myhtml_clean:
rm -f $(myhtml_objs)

294
test/myhtml/commoncrawl.c Normal file
View File

@ -0,0 +1,294 @@
/*
Copyright (C) 2016 Alexander Borisov
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Author: lex.borisov@gmail.com (Alexander Borisov)
For HTML Pages from: http://commoncrawl.org/
*/
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <myhtml/api.h>
/*
 * Print a printf-style message to stderr and terminate with EXIT_FAILURE.
 * `##__VA_ARGS__` is a GNU extension that drops the comma when no variadic
 * arguments are given. The macro is not referenced by the code visible in this file.
 */
#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
/* Single parser tree shared by every document: created in main(), used by html_parser(). */
static myhtml_tree_t* global_tree;
/* Capacity of the size histogram; html_parser()/print_total_count() only touch slots 0..6. */
#define total_count_size 20
/* Number of parsed documents per size bucket (see html_parser() for the bucket limits). */
static size_t total_count[total_count_size];
/*
 * Per-file callback used by listdir() and read_loaded(). In this file it is
 * called with a NUL-terminated path and the length of that path.
 */
typedef void (*process_state_f)(const char* data, size_t filename_size);
/*
 * Per-document callback used by process(). Despite the parameter names it is
 * called with the document bytes, the document length and a counter that
 * process() increments before each call.
 */
typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
void print_total_count(void)
{
size_t total = 0;
for(size_t i = 0; i < 7; i++)
total += total_count[i];
printf("Total: %zu\n" ,total);
printf("\t0-100: %zu\n", total_count[0]);
printf("\t100-1000: %zu\n", total_count[1]);
printf("\t1000-5000: %zu\n", total_count[2]);
printf("\t5000-10000: %zu\n", total_count[3]);
printf("\t10000-50000: %zu\n", total_count[4]);
printf("\t50000-100000: %zu\n", total_count[5]);
printf("\t100000 and up: %zu\n", total_count[6]);
}
/*
 * Recursive worker for listdir().
 *
 * Walks `name` and every sub-directory below it. For each entry that is not
 * reported as a directory and whose full path ends in ".gz", calls
 * `callback(path, path_length)`.
 *
 * Kept separate from listdir() so that the statistics reset happens only once
 * per top-level call and not on every level of the recursion.
 */
static void listdir_walk(const char *name, process_state_f callback)
{
    DIR *dir = opendir(name);
    if(dir == NULL)
        return;

    struct dirent *entry;
    while((entry = readdir(dir)) != NULL) {
        /* Never descend into (or report) the self/parent links. */
        if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
            continue;

        char path[2048];
        int len = snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);

        /* snprintf returns the length it WANTED to write; reject errors and truncation. */
        if(len < 0 || (size_t)len >= sizeof(path)) {
            fprintf(stderr, "Path is too long, skipped: %s/%s\n", name, entry->d_name);
            continue;
        }

        /* NOTE(review): some file systems report DT_UNKNOWN in d_type; such
         * directories are treated as plain files here, as before. */
        if(entry->d_type == DT_DIR) {
            listdir_walk(path, callback);
        }
        else if(len >= 3 && memcmp(&path[len - 3], ".gz", 3) == 0) {
            callback(path, (size_t)len);
        }
    }

    /* Always reached once the directory was opened, so the handle never leaks. */
    closedir(dir);
}

/*
 * Reset the size statistics, then recursively scan directory `name` and call
 * `callback(path, path_length)` for every "*.gz" file found.
 *
 * A directory that cannot be opened is silently skipped.
 */
void listdir(const char *name, process_state_f callback)
{
    memset(total_count, 0, sizeof(total_count));
    listdir_walk(name, callback);
}
/*
 * Read a list file and invoke `callback` for every line in it.
 *
 * filename  file containing one relative path per '\n'-terminated line
 * db_dir    directory that is prefixed to every line ("<db_dir>/<line>")
 * callback  called as callback(path, path_length) for each complete line
 *
 * The size statistics in total_count are reset first. Any I/O or allocation
 * failure terminates the program with EXIT_FAILURE. A line that does not fit
 * into the path buffer is reported on stderr and skipped. Text after the last
 * '\n' is ignored.
 */
void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
{
    memset(total_count, 0, sizeof(total_count));

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    /* Determine the file size; validate it BEFORE it is used for malloc/fread. */
    long file_size = -1;
    if(fseek(fh, 0L, SEEK_END) == 0)
        file_size = ftell(fh);

    if(file_size < 0 || fseek(fh, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "Can't get size of file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t size = (size_t)file_size;

    char *data = malloc(size + 1);
    if(data == NULL) {
        fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t nread = fread(data, 1, size, fh);
    fclose(fh);

    if(nread != size) {
        fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", file_size, nread);
        free(data);
        exit(EXIT_FAILURE);
    }
    data[size] = '\0';

    size_t from = 0;
    char path[2048];

    for(size_t i = 0; i < size; i++) {
        if(data[i] != '\n')
            continue;

        int len = snprintf(path, sizeof(path), "%s/%.*s", db_dir, (int)(i - from), &data[from]);

        /* snprintf returns the untruncated length; never index the buffer with it unchecked. */
        if(len < 0 || (size_t)len >= sizeof(path))
            fprintf(stderr, "Path is too long, skipped: %s/%.*s\n", db_dir, (int)(i - from), &data[from]);
        else
            callback(path, (size_t)len);

        from = i + 1;
    }

    free(data);
}
/*
 * Scan an unpacked archive file and hand every embedded document to `parser`.
 *
 * filename       path of the file to read
 * filename_size  length of `filename`; unused, kept for interface compatibility
 * parser         called as parser(html, html_length, n), where n counts the
 *                documents found so far in this file (starting at 1)
 *
 * The file is read line by line. A "Content-Length:" line stores the announced
 * size; the next blank (CRLF) line starts a record. Lines are then skipped up
 * to the following blank line and the remaining bytes of the record are read
 * into a heap buffer, passed to `parser` and freed.
 *
 * Any I/O error, allocation failure or inconsistent length terminates the
 * program with EXIT_FAILURE.
 */
void process(const char* filename, size_t filename_size, parser_state_f parser)
{
    (void)filename_size;

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    static const char ct[] = "Content-Length:";
    const size_t ct_size = sizeof(ct) - 1;

    char *line = NULL;     /* buffer owned by getline(), released at the end */
    size_t line_cap = 0;   /* capacity of `line`, NOT the length of the current line */
    ssize_t line_len;
    long get_size = 0;
    size_t count = 0;

    while((line_len = getline(&line, &line_cap, fh)) != -1) {
        if(strncmp(ct, line, ct_size) == 0) {
            /* Skip separators after the colon, bounded by the real line length. */
            size_t i = ct_size;
            while(i < (size_t)line_len && (line[i] == '\n' || line[i] == '\r' || line[i] == ' '))
                i++;

            /* Content-Length is decimal; base 10 keeps "0123" from being read as octal. */
            get_size = strtol(&line[i], NULL, 10);
        }
        else if(get_size && line[0] == '\r' && line[1] == '\n') {
            long block_pos = ftell(fh);

            /* Skip the inner header: everything up to the next blank line. */
            while((line_len = getline(&line, &line_cap, fh)) != -1) {
                if(line[0] == '\r' && line[1] == '\n')
                    break;
            }

            long head_end = ftell(fh);
            if(block_pos < 0 || head_end < block_pos) {
                fprintf(stderr, "Can't get position in file: %s\n", filename);
                exit(EXIT_FAILURE);
            }

            /*
             * Same arithmetic as before, (block_pos + 2 + get_size) - head_end,
             * but arranged so that it can neither overflow nor go negative.
             * NOTE(review): the reason for the extra 2 bytes is not visible
             * here (presumably a CRLF pair) -- TODO confirm.
             */
            long consumed = head_end - block_pos;
            if(get_size < 0 || (consumed > 2 && (consumed - 2) > get_size)) {
                fprintf(stderr, "Wrong Content-Length (%ld) in file: %s\n", get_size, filename);
                exit(EXIT_FAILURE);
            }

            size_t html_length = ((size_t)get_size + 2) - (size_t)consumed;

            char *html = malloc(html_length + 1);
            if(html == NULL) {
                fprintf(stderr, "Can't allocate mem for html from file: %s\n", filename);
                exit(EXIT_FAILURE);
            }

            size_t nread = fread(html, 1, html_length, fh);
            if(nread != html_length) {
                fprintf(stderr, "could not read %zu bytes (%zu bytes done)\n", html_length, nread);
                exit(EXIT_FAILURE);
            }
            html[html_length] = '\0';

            count++;
            parser(html, html_length, count);

            get_size = 0;
            free(html);
        }
    }

    free(line);
    fclose(fh);
}
/*
 * parser_state_f implementation: parse one document into global_tree and
 * record its size in the total_count histogram.
 *
 * html         document bytes
 * html_length  number of bytes in `html`
 * count        running document number; every 1000th one is echoed as progress
 *
 * Terminates the program with EXIT_FAILURE when the parser reports an error.
 */
void html_parser(const char* html, size_t html_length, size_t count)
{
    /* Exclusive upper limits of buckets 0..5; anything larger lands in bucket 6. */
    static const size_t bucket_limit[] = { 100, 1000, 5000, 10000, 50000, 100000 };
    const size_t limits = sizeof(bucket_limit) / sizeof(bucket_limit[0]);

    if((count % 1000) == 0)
        printf("\t%zu\n", count);

    /* Encoding detection is not used; the parser gets encoding value 0. */
    myencoding_t encoding = 0;

    if(myhtml_parse(global_tree, encoding, html, html_length) != MyHTML_STATUS_OK) {
        fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
        exit(EXIT_FAILURE);
    }

    size_t bucket = 0;
    while(bucket < limits && html_length >= bucket_limit[bucket])
        bucket++;

    total_count[bucket]++;
}
/*
 * Write `src` into `dst` as one single-quoted POSIX shell word.
 * Every embedded single quote is emitted as '\'' so the word cannot be broken
 * out of. Returns the number of bytes written (without the terminating NUL),
 * or 0 when `dst_size` is too small.
 */
static size_t process_unpack_shell_quote(char *dst, size_t dst_size, const char *src)
{
    size_t pos = 0;

    /* Opening quote, closing quote and NUL are always needed. */
    if(dst_size < 3)
        return 0;

    dst[pos++] = '\'';

    for(; *src != '\0'; src++) {
        size_t need = (*src == '\'') ? 4 : 1;

        /* Keep room for the closing quote and the NUL. */
        if(pos + need + 2 > dst_size)
            return 0;

        if(*src == '\'') {
            memcpy(&dst[pos], "'\\''", 4);
            pos += 4;
        }
        else {
            dst[pos++] = *src;
        }
    }

    dst[pos++] = '\'';
    dst[pos] = '\0';

    return pos;
}

/*
 * process_state_f implementation: unpack one "*.gz" file with gzip (keeping
 * the archive), parse the unpacked file with process()/html_parser and remove
 * the unpacked file afterwards.
 *
 * filename       path of the archive; expected to end in ".gz"
 * filename_size  length of `filename`
 *
 * The file name is shell-quoted before it is handed to system(), so names with
 * spaces or shell metacharacters are passed to gzip as one argument. A file
 * whose name is too short or too long, or for which gzip fails, is reported on
 * stderr and skipped.
 */
void process_unpack(const char* filename, size_t filename_size)
{
    char quoted[2048];
    char command[2048 + 32];
    char new_path[2048];

    /* The unpacked name is the archive name without its 3-byte ".gz" suffix. */
    if(filename_size < 3) {
        fprintf(stderr, "File name is too short, skipped: %s\n", filename);
        return;
    }

    size_t new_path_size = (filename_size - 3);

    if(new_path_size >= sizeof(new_path) ||
       process_unpack_shell_quote(quoted, sizeof(quoted), filename) == 0)
    {
        fprintf(stderr, "File name is too long, skipped: %s\n", filename);
        return;
    }

    /* "--" stops option parsing, so a name starting with '-' is not taken as a flag. */
    int command_len = snprintf(command, sizeof(command), "gzip -k -d -- %s", quoted);
    if(command_len < 0 || (size_t)command_len >= sizeof(command)) {
        fprintf(stderr, "File name is too long, skipped: %s\n", filename);
        return;
    }

    printf("Unzip %s\n", filename);

    if(system(command) != 0) {
        fprintf(stderr, "Can't unzip file, skipped: %s\n", filename);
        return;
    }

    snprintf(new_path, sizeof(new_path), "%.*s", (int)new_path_size, filename);

    printf("Process %s:\n", new_path);
    process(new_path, new_path_size, html_parser);
    printf("\n");

    unlink(new_path);
}
/* Write the one-line command synopsis to stderr. */
static void usage(void)
{
    fputs("commoncrawl <dir with *.warc.gz>\n", stderr);
}
int main(int argc, const char * argv[])
{
if (argc != 2) {
usage();
return 0;
}
// basic init
myhtml_t* myhtml = myhtml_create();
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
// first tree init
global_tree = myhtml_tree_create();
myhtml_tree_init(global_tree, myhtml);
listdir(argv[1], process_unpack);
// release resources
myhtml_tree_destroy(global_tree);
myhtml_destroy(myhtml);
print_total_count();
return 0;
}