From 296072673897ceae2916bc4ca68239831ec8faa8 Mon Sep 17 00:00:00 2001 From: lexborisov Date: Mon, 20 Mar 2017 21:57:29 +0300 Subject: [PATCH] Added new rule for make (make modules) --- INSTALL.md | 76 ++++---- Makefile | 40 ++-- Makefile.cfg | 20 +- source/modest/Makefile.mk | 3 + source/mycore/Makefile.mk | 3 + source/mycss/Makefile.mk | 3 + source/myencoding/Makefile.mk | 3 + source/myfont/Makefile.mk | 3 + source/myhtml/Makefile.mk | 3 + source/myport/posix/Rules.mk | 2 +- source/myport/windows_nt/Rules.mk | 2 +- source/myunicode/Makefile.mk | 3 + source/myurl/Makefile.mk | 3 + test/Makefile | 6 +- test/myhtml/Makefile.mk | 7 + test/myhtml/commoncrawl.c | 294 ++++++++++++++++++++++++++++++ 16 files changed, 401 insertions(+), 70 deletions(-) create mode 100644 test/myhtml/Makefile.mk create mode 100644 test/myhtml/commoncrawl.c diff --git a/INSTALL.md b/INSTALL.md index 9fe0d69..0464370 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,51 +1,47 @@ # Modest: Build and Installation -## make +## GNU Make -In root directory: +In root directory of project (`/`): ```bash make -``` - -If successful copy lib/* and include/* at the right place for you - -Flags that can be passed to make: -- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2 -- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO - -*for example* -```bash -make MODEST_BUILD_WITHOUT_THREADS=NO -``` - -and copy to the right place for you -```bash -cp lib/* /usr/local/lib -cp -r include/* /usr/local/include -``` - -## cmake - -In `project` directory: -```bash -cmake . -make +make test sudo make install ``` -Flags that can be passed to CMake: -- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2 -- `CMAKE_INSTALL_LIBDIR=lib` set path to install created library. Default: lib -- `MODEST_BUILD_SHARED=ON` build shared library. Default: ON -- `MODEST_BUILD_STATIC=ON` build static library. 
Default: ON -- `MODEST_INSTALL_HEADER=OFF` install header files. Default ON -- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO -- `MODEST_EXTERN_MALLOC=my_malloc_func` set extern malloc function. Default: UNDEFINED -- `MODEST_EXTERN_REALLOC=my_realloc_func` set extern realloc function. Default: UNDEFINED -- `MODEST_EXTERN_CALLOC=my_calloc_func` set extern calloc function. Default: UNDEFINED -- `MODEST_EXTERN_FREE=my_free_func` set extern free function. Default: UNDEFINED +Flags that can be passed to make: +- `prefix`, default /usr/local +- `OS`, if not defined try to get from "uname -s" +- `PROJECT_OPTIMIZATION_LEVEL`, default -O2 +- `MyCORE_BUILD_WITHOUT_THREADS`, YES or (NO or undefined), default undefined +- `MyCORE_BUILD_DEBUG`, YES or (NO or undefined), default undefined +- `MyCORE_WITH_PERF`, YES or (NO or undefined), default undefined, try build with timers (rdtsc or some), OS dependent, may not work on some systems, +- `PROJECT_INSTALL_HEADER`, default "include" +- `PROJECT_INSTALL_LIBRARY`, default "lib" +- `PROJECT_INSTALL_WITHOUT_HEADERS`, YES or (NO or undefined), default undefined -*for example* +*for example*: ```bash -cmake . 
-DCMAKE_INSTALL_LIBDIR=lib64 -DMODEST_INSTALL_HEADER=ON +make -j4 prefix=/usr MyCORE_BUILD_WITHOUT_THREADS=YES +sudo make install +``` + +Makefile rules: +- `all` -- build all components (libraries, examples, tests) (default) +- `library` -- build only static and shared library +- `shared` -- build only shared library +- `static` -- build only static library +- `clean` -- clean up current build directory +- `clone` -- copy all headers from source to include directories and modify local include (`#include "..."`) to global (`#include <...>`) +- `clean_api` -- remove all headers from include directory +- `create` -- create directories for binary, libraries, tests +- `test` -- run all tests +- `modules` -- print modules name, description, dependencies +- `install` -- install libraries and headers on your system +- `uninstall` -- delete libraries and headers on your system +- `make-pc-file` -- create pkg-config file + +*for example*: +```bash +make shared ``` diff --git a/Makefile b/Makefile index efb5b41..504fd0d 100644 --- a/Makefile +++ b/Makefile @@ -15,18 +15,19 @@ CC ?= gcc # install -- install libraries and headers on your system # uninstall -- delete libraries and headers on your system # test -- run all tests +# modules -- print modules list: Module name, Description, Dependencies # make-pc-file -- create pkg-config file # # ARGS # prefix, default /usr/local # OS, if not defined try to get from "uname -s" -# MODEST_OPTIMIZATION_LEVEL, default -O2 +# PROJECT_OPTIMIZATION_LEVEL, default -O2 # MyCORE_BUILD_WITHOUT_THREADS, YES or (NO or undefined), default undefined # MyCORE_BUILD_DEBUG, YES or (NO or undefined), default undefined # MyCORE_WITH_PERF, YES or (NO or undefined), default undefined, try build with timers (rdtsc or some), OS dependent, may not work on some systems, -# MODEST_INSTALL_HEADER, default "include" -# MODEST_INSTALL_LIBRARY, default "lib" -# MODEST_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined +# PROJECT_INSTALL_HEADER, 
default "include" +# PROJECT_INSTALL_LIBRARY, default "lib" +# PROJECT_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined # # If OS build rules not exists we try make library with POSIX threads @@ -76,6 +77,12 @@ MyPORT_SELECTED_PORT = myport/$(strip $(MODEST_PORT_NAME)) #*************** include $(MODEST_BUILD_MODULES_MAKEFILES_LIST) +#******************** +# Modules info +#*************** +MODEST_BUILD_MODULES_INFO_DEP = $(foreach dep,$(strip $($1_dependencies)), $(dep)) +MODEST_BUILD_MODULES_INFO := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),$(MODEST_UTILS_NEW_LINE)Module: $(name)$(MODEST_UTILS_NEW_LINE)Description: $($(name)_description)$(MODEST_UTILS_NEW_LINE)Dependencies:$(call MODEST_BUILD_MODULES_INFO_DEP,$(name))$(MODEST_UTILS_NEW_LINE)) + #******************** # Set ARGS for flags #*************** @@ -102,17 +109,17 @@ BUILD_SUB_DIRS := examples $(TEST_DIR) #******************** # Install #*************** -MODEST_INSTALL_LIBRARY := lib -MODEST_INSTALL_HEADER := include +PROJECT_INSTALL_LIBRARY := lib +PROJECT_INSTALL_HEADER := include -libdir ?= $(prefix)/$(MODEST_INSTALL_LIBRARY) -includedir ?= $(prefix)/$(MODEST_INSTALL_HEADER) +libdir ?= $(prefix)/$(PROJECT_INSTALL_LIBRARY) +includedir ?= $(prefix)/$(PROJECT_INSTALL_HEADER) -MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(MODEST_INSTALL_LIBRARY) +MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(PROJECT_INSTALL_LIBRARY) MODEST_INSTALL_COMMAND := $(MODEST_INSTALL_CREATE_DIR) $(MODEST_UTILS_NEW_LINE) cp -av $(LIB_DIR_BASE)/* $(libdir) -ifneq ($(MODEST_INSTALL_WITHOUT_HEADERS),YES) - MODEST_INSTALL_CREATE_DIR += $(prefix)/$(MODEST_INSTALL_HEADER) +ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES) + MODEST_INSTALL_CREATE_DIR += $(prefix)/$(PROJECT_INSTALL_HEADER) MODEST_INSTALL_COMMAND += $(MODEST_UTILS_NEW_LINE) cp -av $(INCLUDE_DIR_API)/* $(includedir) endif @@ -122,7 +129,7 @@ endif MODEST_UNINSTALL_MK_COMMAND := MODEST_UNINSTALL_FILE := uninstal.mk -ifneq 
($(MODEST_INSTALL_WITHOUT_HEADERS),YES) +ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES) MODEST_UNINSTALL_HEADERS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),rm -rf $(includedir)/$(name) \$$(MODEST_UTILS_NEW_LINE)) endif @@ -138,11 +145,11 @@ MODEST_PKG_CONFIG_FILE := modest.pc MODEST_PKG_CONFIG_CFLAGS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),-I$\{includedir}/$(name)) MODEST_PKG_CONFIG_PROCESS = \ $(SED) \ --e 's,@version\@,$(MODEST_VERSION_STRING),g' \ +-e 's,@version\@,$(PROJECT_VERSION_STRING),g' \ -e 's,@prefix\@,$(prefix),g' \ -e 's,@exec_prefix\@,$(exec_prefix),g' \ --e 's,@libdir\@,$(MODEST_INSTALL_LIBRARY),g' \ --e 's,@includedir\@,$(MODEST_INSTALL_HEADER),g' \ +-e 's,@libdir\@,$(PROJECT_INSTALL_LIBRARY),g' \ +-e 's,@includedir\@,$(PROJECT_INSTALL_HEADER),g' \ -e 's,@cflags\@,$(MODEST_PKG_CONFIG_CFLAGS),g' \ -e 's,@libname\@,$(LIB_NAME),g' \ -e 's,@description\@,$(DESCRIPTION),g' \ @@ -193,4 +200,7 @@ test: library make-pc-file: $(call MODEST_PKG_CONFIG_PROCESS,$(MODEST_PKG_CONFIG_FILE).in, $(MODEST_PKG_CONFIG_FILE)) +modules: + $(info $(MODEST_BUILD_MODULES_INFO)) + .PHONY: all clean clone test $(MODEST_BUILD_MODULES_TARGET_ALL) diff --git a/Makefile.cfg b/Makefile.cfg index eb5e8be..92d425d 100644 --- a/Makefile.cfg +++ b/Makefile.cfg @@ -3,11 +3,11 @@ MODEST_BUILD_OS := UNDEF #******************** # Version #*************** -MODEST_VERSION_MAJOR := 0 -MODEST_VERSION_MINOR := 0 -MODEST_VERSION_PATCH := 6 +PROJECT_VERSION_MAJOR := 0 +PROJECT_VERSION_MINOR := 0 +PROJECT_VERSION_PATCH := 6 -MODEST_VERSION_STRING := $(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR).$(MODEST_VERSION_PATCH) +PROJECT_VERSION_STRING := $(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR).$(PROJECT_VERSION_PATCH) #******************** # Flags @@ -37,14 +37,14 @@ LIB_DIR_BASE := lib # for use actual variables like a LIB_NAME_SUFFIX MODEST_LIBRARY_NAME ?= lib$(LIB_NAME)$(LIB_NAME_SUFFIX) MODEST_LIBRARY_NAME_STATIC 
?=lib$(LIB_NAME)$(LIB_NAME_SUFFIX_STATIC) -MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(MODEST_VERSION_STRING)$(LIB_NAME_SUFFIX) +MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(PROJECT_VERSION_STRING)$(LIB_NAME_SUFFIX) MODEST_LIBRARY ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME) MODEST_LIBRARY_STATIC ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_STATIC) MODEST_LIBRARY_WITH_VERSION = $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_WITH_VERSION) -MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX) -MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR)$(LIB_NAME_SUFFIX) +MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX) +MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR)$(LIB_NAME_SUFFIX) #******************** # Binaries @@ -75,7 +75,7 @@ SED ?= sed MODEST_DIR_SEPARATOR ?= / # flags -MODEST_OPTIMIZATION_LEVEL ?= -O2 +PROJECT_OPTIMIZATION_LEVEL ?= -O2 MODEST_CFLAGS += -I$(INCLUDE_DIR) MODEST_LDFLAGS += @@ -89,7 +89,7 @@ MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed # Set -D #*************** ifeq ($(MyCORE_BUILD_DEBUG),YES) - override MODEST_OPTIMIZATION_LEVEL := + override PROJECT_OPTIMIZATION_LEVEL := MODEST_CFLAGS += -g3 -ggdb3 -O0 -fno-omit-frame-pointer -DMyCORE_BUILD_DEBUG endif @@ -128,7 +128,7 @@ endif # def MODEST_PORT_NAME ifeq ($(MODEST_BUILD_OS),UNDEF) MODEST_CFLAGS += -fPIC MODEST_CFLAGS += -D_POSIX_C_SOURCE=199309L - MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99 + MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99 MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE) MODEST_BUILD_SHARED_AFTER += ln -sf $(call 
MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY_WITH_VERSION_MAJOR) $(MODEST_UTILS_NEW_LINE) diff --git a/source/modest/Makefile.mk b/source/modest/Makefile.mk index f2e12a1..0ab2e7f 100644 --- a/source/modest/Makefile.mk +++ b/source/modest/Makefile.mk @@ -1,6 +1,9 @@ modest_dirs := . finder style node layer render utils modest_objs := $(call MODEST_UTILS_OBJS,modest,$(modest_dirs)) +modest_description := calculating, compare, renderer +modest_dependencies := mycore mycss myencoding myfont myhtml myport myunicode myurl + modest_all: $(modest_objs) modest_clean: diff --git a/source/mycore/Makefile.mk b/source/mycore/Makefile.mk index 4156566..fbbd1c1 100644 --- a/source/mycore/Makefile.mk +++ b/source/mycore/Makefile.mk @@ -1,6 +1,9 @@ mycore_dirs := . utils mycore_objs := $(call MODEST_UTILS_OBJS,mycore,$(mycore_dirs)) +mycore_description := base module, it is used by all other modules +mycore_dependencies := + mycore_all: $(mycore_objs) mycore_clean: diff --git a/source/mycss/Makefile.mk b/source/mycss/Makefile.mk index 14e8478..51005aa 100644 --- a/source/mycss/Makefile.mk +++ b/source/mycss/Makefile.mk @@ -1,6 +1,9 @@ mycss_dirs := . selectors namespace media values property declaration mycss_objs := $(call MODEST_UTILS_OBJS,mycss,$(mycss_dirs)) +mycss_description := CSS parser and modules by https://drafts.csswg.org/ +mycss_dependencies := mycore myencoding myport + mycss_all: $(mycss_objs) mycss_clean: diff --git a/source/myencoding/Makefile.mk b/source/myencoding/Makefile.mk index 9f31378..d6194bc 100644 --- a/source/myencoding/Makefile.mk +++ b/source/myencoding/Makefile.mk @@ -1,6 +1,9 @@ myencoding_dirs := . 
myencoding_objs := $(call MODEST_UTILS_OBJS,myencoding,$(myencoding_dirs)) +myencoding_description := work with character encodings, detecting encoding, convert encodings by https://encoding.spec.whatwg.org/ +myencoding_dependencies := mycore myport + myencoding_all: $(myencoding_objs) myencoding_clean: diff --git a/source/myfont/Makefile.mk b/source/myfont/Makefile.mk index b8cdffc..91ea339 100644 --- a/source/myfont/Makefile.mk +++ b/source/myfont/Makefile.mk @@ -1,6 +1,9 @@ myfont_dirs := . myfont_objs := $(call MODEST_UTILS_OBJS,myfont,$(myfont_dirs)) +myfont_description := work with font, metrics, calculating size and more by https://www.microsoft.com/en-us/Typography/SpecificationsOverview.aspx +myfont_dependencies := mycore myport + myfont_all: $(myfont_objs) myfont_clean: diff --git a/source/myhtml/Makefile.mk b/source/myhtml/Makefile.mk index e67c236..b4b30cb 100644 --- a/source/myhtml/Makefile.mk +++ b/source/myhtml/Makefile.mk @@ -1,6 +1,9 @@ myhtml_dirs := . myhtml_objs := $(call MODEST_UTILS_OBJS,myhtml,$(myhtml_dirs)) +myhtml_description := HTML parser by https://html.spec.whatwg.org/multipage/ +myhtml_dependencies := mycore myencoding myport + myhtml_all: $(myhtml_objs) myhtml_clean: diff --git a/source/myport/posix/Rules.mk b/source/myport/posix/Rules.mk index 4ca0292..7684fb7 100644 --- a/source/myport/posix/Rules.mk +++ b/source/myport/posix/Rules.mk @@ -8,7 +8,7 @@ ifeq ($(OS),Darwin) MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed -i '.bak' -E 's/^[ \t]*\#[ \t]*include[ \t]*"([^"]+)"/\#include <\1>/g' {} \; MODEST_CFLAGS += -fPIC - MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99 + MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99 LIB_NAME_SUFFIX := .dylib MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE) diff --git 
a/source/myport/windows_nt/Rules.mk b/source/myport/windows_nt/Rules.mk index a122cb6..f802892 100644 --- a/source/myport/windows_nt/Rules.mk +++ b/source/myport/windows_nt/Rules.mk @@ -5,7 +5,7 @@ ifeq ($(OS),Windows_NT) LIB_NAME_SUFFIX := .dll LIB_NAME_SUFFIX_STATIC := .dll.a - MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX) + MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX) MODEST_CFLAGS += -Wno-unused-variable -Wno-unused-function -std=c99 MODEST_LDFLAGS += -Wl,--out-implib,$(call MODEST_LIBRARY_STATIC) diff --git a/source/myunicode/Makefile.mk b/source/myunicode/Makefile.mk index 35d955b..e219cf0 100644 --- a/source/myunicode/Makefile.mk +++ b/source/myunicode/Makefile.mk @@ -1,6 +1,9 @@ myunicode_dirs := . myunicode_objs := $(call MODEST_UTILS_OBJS,myunicode,$(myunicode_dirs)) +myunicode_description := unicode normalization, case work and other +myunicode_dependencies := mycore myport + myunicode_all: $(myunicode_objs) myunicode_clean: diff --git a/source/myurl/Makefile.mk b/source/myurl/Makefile.mk index 6b6cfee..079aa5e 100644 --- a/source/myurl/Makefile.mk +++ b/source/myurl/Makefile.mk @@ -1,6 +1,9 @@ myurl_dirs := . 
myurl_objs := $(call MODEST_UTILS_OBJS,myurl,$(myurl_dirs)) +myurl_description := URL parser by https://url.spec.whatwg.org/ +myurl_dependencies := mycore myport + myurl_all: $(myurl_objs) myurl_clean: diff --git a/test/Makefile b/test/Makefile index 7431918..5bbb11a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -53,9 +53,9 @@ include $(BINARY_BUILD_MODULES_MAKEFILES_LIST) #******************** # Set ARGS for flags #*************** -CFLAGS += $(BINARY_CFLAGS) -LDFLAGS += $(BINARY_LDFLAGS) -LDLIBS += $(BINARY_LIBRARIES) +override CFLAGS += $(BINARY_CFLAGS) +override LDFLAGS += $(BINARY_LDFLAGS) +override LDLIBS += $(BINARY_LIBRARIES) #******************** # Objects diff --git a/test/myhtml/Makefile.mk b/test/myhtml/Makefile.mk new file mode 100644 index 0000000..f6fc0c7 --- /dev/null +++ b/test/myhtml/Makefile.mk @@ -0,0 +1,7 @@ +myhtml_dirs := . +myhtml_objs := $(call BINARY_UTILS_OBJS,myhtml,$(myhtml_dirs)) + +myhtml_all: $(myhtml_objs) + +myhtml_clean: + rm -f $(myhtml_objs) diff --git a/test/myhtml/commoncrawl.c b/test/myhtml/commoncrawl.c new file mode 100644 index 0000000..847ec6f --- /dev/null +++ b/test/myhtml/commoncrawl.c @@ -0,0 +1,294 @@ +/* + Copyright (C) 2016 Alexander Borisov + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ Author: lex.borisov@gmail.com (Alexander Borisov)
+ For HTML Pages from: http://commoncrawl.org/
+*/
+
+/* NOTE(review): the header names were missing from the extracted patch and
+   are reconstructed from the symbols this file uses -- confirm. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <myhtml/myhtml.h>
+
+#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+
+static myhtml_tree_t* global_tree;
+
+#define total_count_size 20
+static size_t total_count[total_count_size];
+
+/* Size of every stack buffer that holds a file system path. */
+#define path_buffer_size 2048
+
+/*
+ Histogram buckets for the length of the parsed documents.
+ A document goes into the first bucket whose (exclusive) limit is greater
+ than its length; the last entry is the catch-all and its limit is unused.
+ The number of buckets must not exceed total_count_size.
+*/
+static const struct {
+    size_t      limit;
+    const char *label;
+} size_buckets[] = {
+    {    100, "0-100"         },
+    {   1000, "100-1000"      },
+    {   5000, "1000-5000"     },
+    {  10000, "5000-10000"    },
+    {  50000, "10000-50000"   },
+    { 100000, "50000-100000"  },
+    {      0, "100000 and up" }
+};
+
+#define size_buckets_count (sizeof(size_buckets) / sizeof(size_buckets[0]))
+
+typedef void (*process_state_f)(const char* data, size_t filename_size);
+typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
+
+/*
+ Write "dir/name" into dst, using at most name_len bytes of name.
+ Returns the length of the result, or -1 if it does not fit into dst_size
+ bytes (including the terminating '\0'); dst is never overrun.
+*/
+static int build_path(char *dst, size_t dst_size, const char *dir, const char *name, size_t name_len)
+{
+    /* also guarantees that name_len fits into the int precision below */
+    if(name_len >= dst_size)
+        return -1;
+
+    int len = snprintf(dst, dst_size, "%s/%.*s", dir, (int)name_len, name);
+    if(len < 0 || (size_t)len >= dst_size)
+        return -1;
+
+    return len;
+}
+
+/* Return non-zero if the first len bytes of path end with ".gz". */
+static int has_gz_suffix(const char *path, size_t len)
+{
+    return len >= 3 && memcmp(&path[len - 3], ".gz", 3) == 0;
+}
+
+/* Print the total number of parsed documents and the per-size histogram. */
+void print_total_count(void)
+{
+    size_t total = 0;
+    for(size_t i = 0; i < size_buckets_count; i++)
+        total += total_count[i];
+
+    printf("Total: %zu\n", total);
+
+    for(size_t i = 0; i < size_buckets_count; i++)
+        printf("\t%s: %zu\n", size_buckets[i].label, total_count[i]);
+}
+
+/*
+ Recursive part of listdir(): call callback for every "*.gz" entry below name.
+ Kept separate so that the counters are reset only once per listdir() call
+ and not every time a subdirectory is entered.
+*/
+static void listdir_walk(const char *name, process_state_f callback)
+{
+    DIR *dir = opendir(name);
+    if(dir == NULL)
+        return;
+
+    struct dirent *entry;
+
+    while((entry = readdir(dir)) != NULL) {
+        if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+            continue;
+
+        char path[path_buffer_size];
+
+        int len = build_path(path, sizeof(path), name, entry->d_name, strlen(entry->d_name));
+        if(len < 0) {
+            fprintf(stderr, "Path is too long, skipped: %s/%s\n", name, entry->d_name);
+            continue;
+        }
+
+        if(entry->d_type == DT_DIR)
+            listdir_walk(path, callback);
+        else if(has_gz_suffix(path, (size_t)len))
+            callback(path, (size_t)len);
+    }
+
+    closedir(dir);
+}
+
+/*
+ Reset the counters, then walk the directory tree rooted at name and call
+ callback(path, path_length) for every entry whose name ends with ".gz".
+ Directories that can not be opened are silently skipped.
+*/
+void listdir(const char *name, process_state_f callback)
+{
+    memset(total_count, 0, sizeof(total_count));
+
+    listdir_walk(name, callback);
+}
+
+/*
+ Reset the counters, then read a list of file names (one per line) from
+ filename and call callback("db_dir/<line>", length) for every non-empty line.
+ A last line without a trailing '\n' is processed too.
+ Exits the program if the list can not be read.
+*/
+void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
+{
+    memset(total_count, 0, sizeof(total_count));
+
+    FILE *fh = fopen(filename, "rb");
+    if(fh == NULL)
+        DIE("Can't open html file: %s\n", filename);
+
+    /* the size must be validated before it is used for malloc()/fread() */
+    long size = -1;
+    if(fseek(fh, 0L, SEEK_END) == 0)
+        size = ftell(fh);
+
+    if(size < 0 || fseek(fh, 0L, SEEK_SET) != 0)
+        DIE("Can't get size of html file: %s\n", filename);
+
+    size_t data_size = (size_t)size;
+
+    char *data = malloc(data_size + 1);
+    if(data == NULL)
+        DIE("Can't allocate mem for html file: %s\n", filename);
+
+    size_t nread = fread(data, 1, data_size, fh);
+    fclose(fh);
+
+    if(nread != data_size)
+        DIE("could not read %ld bytes (%zu bytes done)\n", size, nread);
+
+    char path[path_buffer_size];
+    size_t from = 0;
+
+    /* i == data_size acts as a virtual '\n' that closes the last line */
+    for(size_t i = 0; i <= data_size; i++) {
+        if(i < data_size && data[i] != '\n')
+            continue;
+
+        size_t name_len = i - from;
+
+        if(name_len != 0) {
+            int len = build_path(path, sizeof(path), db_dir, &data[from], name_len);
+
+            if(len < 0)
+                fprintf(stderr, "Path is too long, skipped entry at offset %zu of %s\n", from, filename);
+            else
+                callback(path, (size_t)len);
+        }
+
+        from = i + 1;
+    }
+
+    free(data);
+}
+
+/*
+ Scan an unpacked crawl file record by record.
+ For every record: remember the value of its "Content-Length:" line, wait for
+ the empty CRLF line that ends the record header, skip the following header
+ block (which ends with an empty CRLF line as well), read the rest of the
+ record into memory and pass it to parser(html, html_length, count), where
+ count is the 1-based index of the document inside this file.
+ filename_size is not used; it is kept for the callers.
+ Exits the program on I/O or allocation errors.
+*/
+void process(const char* filename, size_t filename_size, parser_state_f parser)
+{
+    (void)filename_size;
+
+    FILE *fh = fopen(filename, "rb");
+    if(fh == NULL)
+        DIE("Can't open html file: %s\n", filename);
+
+    static const char ct[] = "Content-Length:";
+    const size_t ct_size = sizeof(ct) - 1;
+
+    char   *line     = NULL;
+    size_t  line_cap = 0;   /* capacity of the buffer owned by getline(), not a line length */
+    ssize_t line_len = 0;
+
+    long   get_size = 0;
+    size_t count    = 0;
+
+    while((line_len = getline(&line, &line_cap, fh)) != -1) {
+        if(strncmp(ct, line, ct_size) == 0) {
+            size_t i = ct_size;
+
+            /* stay inside the bytes that getline() really returned */
+            while(i < (size_t)line_len && (line[i] == '\n' || line[i] == '\r' || line[i] == ' '))
+                i++;
+
+            /* the value is decimal; base 0 would read a leading zero as octal */
+            get_size = strtol(&line[i], NULL, 10);
+            if(get_size < 0)
+                get_size = 0;
+
+            continue;
+        }
+
+        if(get_size == 0 || line[0] != '\r' || line[1] != '\n')
+            continue;
+
+        long pos = ftell(fh);
+        if(pos < 0 || get_size > LONG_MAX - pos - 2)
+            DIE("Bad position or Content-Length in file: %s\n", filename);
+
+        /* NOTE(review): the extra 2 bytes are kept from the original code;
+           presumably they cover the CRLF that follows the record -- confirm */
+        long head_begin = pos + 2;
+        long end = head_begin + get_size;
+
+        while((line_len = getline(&line, &line_cap, fh)) != -1) {
+            if(line[0] == '\r' && line[1] == '\n')
+                break;
+        }
+
+        long head_end = ftell(fh);
+        if(head_end < 0)
+            DIE("Can't get position in file: %s\n", filename);
+
+        get_size = 0;
+
+        if(end < head_end) {
+            /* the unsigned subtraction below would wrap around to a huge length */
+            fprintf(stderr, "Header is longer than Content-Length, record skipped: %s\n", filename);
+            continue;
+        }
+
+        size_t html_length = (size_t)(end - head_end);
+
+        char *html = malloc(html_length + 1);
+        if(html == NULL)
+            DIE("Can't allocate mem for html from file: %s\n", filename);
+
+        size_t nread = fread(html, 1, html_length, fh);
+        if(nread != html_length)
+            DIE("could not read %zu bytes (%zu bytes done)\n", html_length, nread);
+
+        html[html_length] = '\0';
+
+        count++;
+        parser(html, html_length, count);
+
+        free(html);
+    }
+
+    free(line);
+    fclose(fh);
+}
+
+/*
+ Parse one document into global_tree and account its length in total_count.
+ Prints count after every 1000th document; exits the program if the document
+ can not be parsed.
+*/
+void html_parser(const char* html, size_t html_length, size_t count)
+{
+    if((count % 1000) == 0)
+        printf("\t%zu\n", count);
+
+    /* the encoding is not detected; 0 is passed as in the original code
+       NOTE(review): presumably the id of the default encoding -- confirm */
+    myencoding_t encoding = 0;
+
+    // parse html
+    myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
+    if(status != MyHTML_STATUS_OK) {
+        int shown = (html_length > (size_t)INT_MAX) ? INT_MAX : (int)html_length;
+
+        fprintf(stderr, "Can't parse:\n%.*s\n", shown, html);
+        exit(EXIT_FAILURE);
+    }
+
+    size_t bucket = 0;
+    while(bucket < (size_buckets_count - 1) && html_length >= size_buckets[bucket].limit)
+        bucket++;
+
+    total_count[bucket]++;
+}
+
+/*
+ Run "gzip -k -d -- filename" and wait for it to finish.
+ The file name is passed as a separate argument, not through a shell, so
+ spaces and shell metacharacters in it are harmless.
+ Returns 0 if gzip finished with exit status 0 or 2 (warning), otherwise -1.
+*/
+static int gunzip_keep_source(const char *filename)
+{
+    pid_t pid = fork();
+    if(pid < 0)
+        return -1;
+
+    if(pid == 0) {
+        execlp("gzip", "gzip", "-k", "-d", "--", filename, (char *)NULL);
+        _exit(127); /* reached only if gzip could not be started */
+    }
+
+    int status = 0;
+    if(waitpid(pid, &status, 0) < 0)
+        return -1;
+
+    if(WIFEXITED(status) && (WEXITSTATUS(status) == 0 || WEXITSTATUS(status) == 2))
+        return 0;
+
+    return -1;
+}
+
+/*
+ Callback for listdir()/read_loaded(): unpack the ".gz" file filename next to
+ itself, parse every document of the unpacked file and remove the unpacked
+ file again. filename_size is the length of filename.
+ Files without the ".gz" suffix are skipped; exits the program if the file
+ can not be unpacked.
+*/
+void process_unpack(const char* filename, size_t filename_size)
+{
+    if(has_gz_suffix(filename, filename_size) == 0) {
+        fprintf(stderr, "Not a .gz file, skipped: %s\n", filename);
+        return;
+    }
+
+    char new_path[path_buffer_size];
+    size_t new_path_size = (filename_size - 3);
+
+    if(new_path_size >= sizeof(new_path)) {
+        fprintf(stderr, "Path is too long, skipped: %s\n", filename);
+        return;
+    }
+
+    snprintf(new_path, sizeof(new_path), "%.*s", (int)new_path_size, filename);
+
+    printf("Unzip %s\n", filename);
+
+    if(gunzip_keep_source(filename) != 0)
+        DIE("Can't unzip file: %s\n", filename);
+
+    printf("Process %s:\n", new_path);
+    process(new_path, new_path_size, html_parser);
+    printf("\n");
+
+    unlink(new_path);
+}
+
+static void usage(void)
+{
+    fprintf(stderr, "Usage: commoncrawl <directory>\n");
+}
+
+int main(int argc, const char * argv[])
+{
+    if (argc != 2) {
+        usage();
+        return 0;
+    }
+
+    // basic init
+    myhtml_t* myhtml = myhtml_create();
+    if(myhtml == NULL || myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0) != MyHTML_STATUS_OK) {
+        fprintf(stderr, "Can't init myhtml\n");
+        return EXIT_FAILURE;
+    }
+
+    // first tree init
+    global_tree = myhtml_tree_create();
+    if(global_tree == NULL || myhtml_tree_init(global_tree, myhtml) != MyHTML_STATUS_OK) {
+        fprintf(stderr, "Can't init myhtml tree\n");
+        return EXIT_FAILURE;
+    }
+
+    listdir(argv[1], process_unpack);
+
+    // release resources
+    myhtml_tree_destroy(global_tree);
+    myhtml_destroy(myhtml);
+
+    print_total_count();
+
+    return 0;
+}
+