From 9ee3007853cca6338afbec8c5603ffd6d94f46ca Mon Sep 17 00:00:00 2001 From: Mariano Scasso <75589700+mscasso-scanoss@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:42:09 -0300 Subject: [PATCH] 2.6.0 (#59) * removed ldb duplicated code. * solve memory segfault during attribution mining. * update Makefile. * remove md5 calc duplicated functions. --- Makefile | 22 ++- external/src/winnowing.c | 1 - inc/crc32c.h | 2 +- inc/file.h | 2 - inc/import.h | 4 +- inc/join.h | 7 - inc/md5.h | 9 -- inc/minr.h | 2 +- inc/minr_log.h | 3 +- inc/mz.h | 7 - src/attributions.c | 2 +- src/file.c | 1 - src/import.c | 1 - src/main.c | 3 +- src/md5.c | 80 ---------- src/minr.c | 7 +- src/minr_log.c | 8 +- src/mz_deflate.c | 330 --------------------------------------- src/mz_main.c | 7 +- src/mz_optimise.c | 20 +-- src/url.c | 1 - src/wfp.c | 1 - 22 files changed, 32 insertions(+), 488 deletions(-) delete mode 100644 inc/join.h delete mode 100644 inc/md5.h delete mode 100644 src/md5.c delete mode 100644 src/mz_deflate.c diff --git a/Makefile b/Makefile index 425bead..6be133a 100644 --- a/Makefile +++ b/Makefile @@ -6,19 +6,11 @@ CCFLAGS?=-g -Wall -I./inc -I./external/inc -D_LARGEFILE64_SOURCE -D_GNU_SOURCE # Linker flags LDFLAGS=-lz -lldb -lpthread -ldl -LDB_CURRENT_VERSION := $(shell ldb -v | sed 's/ldb-//' | head -c 3) -LDB_TARGET_VERSION := 3.2 - -VERSION_IS_LESS := $(shell echo $(LDB_CURRENT_VERSION) \< $(LDB_TARGET_VERSION) | bc) -ifeq ($(VERSION_IS_LESS),1) - LDFLAGS += -lcrypto -endif - BUILD_DIR =build SOURCES=$(wildcard src/*.c) $(wildcard src/**/*.c) $(wildcard external/*.c) $(wildcard external/**/*.c) SOURCES_MINR=$(filter-out src/mz_main.c, $(SOURCES)) -OBJECTS_MIRN=$(SOURCES_MINR:.c=.o) +OBJECTS_MINR=$(SOURCES_MINR:.c=.o) SOURCES_MZ=$(filter-out src/main.c, $(SOURCES)) OBJECTS_MZ=$(SOURCES_MZ:.c=.o) @@ -28,11 +20,17 @@ TARGET_MZ=mz VERSION=$(shell ./version.sh) +LDB_CURRENT_VERSION := $(shell ldb -v | sed 's/ldb-//' | head -c 3) +LDB_TARGET_VERSION := 4.1 +VERSION_IS_LESS := $(shell echo $(LDB_CURRENT_VERSION) \< $(LDB_TARGET_VERSION) | bc) + all: clean $(TARGET_MINR) $(TARGET_MZ) -$(TARGET_MINR): $(OBJECTS_MIRN) - @echo "Current version: $(LDB_CURRENT_VERSION)" - @echo "LDFLAGS: $(LDFLAGS)" +$(TARGET_MINR): $(OBJECTS_MINR) +ifeq ($(VERSION_IS_LESS),1) + @echo "Current LDB version: $(LDB_CURRENT_VERSION) is too old, please update to the lastest version to continue." + exit 1 +endif $(CC) -o $@ $^ $(LDFLAGS) $(TARGET_MZ): $(OBJECTS_MZ) diff --git a/external/src/winnowing.c b/external/src/winnowing.c index e7f53fa..2888b2e 100644 --- a/external/src/winnowing.c +++ b/external/src/winnowing.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "crc32c.h" #include "winnowing.h" diff --git a/inc/crc32c.h b/inc/crc32c.h index dd74c46..2c452d7 100644 --- a/inc/crc32c.h +++ b/inc/crc32c.h @@ -1,6 +1,6 @@ #ifndef __CRC32C_H #define __CRC32C_H - +#include uint32_t calc_crc32c (char *data, size_t len); #endif \ No newline at end of file diff --git a/inc/file.h b/inc/file.h index 95d9030..1ebb323 100644 --- a/inc/file.h +++ b/inc/file.h @@ -5,8 +5,6 @@ #include #include - -//void file_md5(char *filepath, uint8_t *md5_result); uint64_t get_file_size(char *path); void read_file(char *out, char *path, uint64_t maxlen); bool is_file(char *path); diff --git a/inc/import.h b/inc/import.h index 18d999b..094faed 100644 --- a/inc/import.h +++ b/inc/import.h @@ -1,5 +1,5 @@ -#ifndef __IMPORT_H -#define __IMPORT_H +#ifndef __MINR_IMPORT_H +#define __MINR_IMPORT_H #include diff --git a/inc/join.h b/inc/join.h deleted file mode 100644 index 042bed6..0000000 --- a/inc/join.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __IMPORT_H -#define __IMPORT_H - -void minr_join_mz(char *source, char *destination); - -#endif - diff --git a/inc/md5.h b/inc/md5.h deleted file mode 100644 index b7a37a2..0000000 --- a/inc/md5.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __MD5_H -#define __MD5_H - -#include -#ifndef MD5 -uint8_t *file_md5 (char *path); -void calc_md5(char *data, int size, uint8_t *out); -#endif -#endif \ No newline at end of file diff --git a/inc/minr.h b/inc/minr.h index 55a1e85..bbabc65 100644 --- a/inc/minr.h +++ b/inc/minr.h @@ -29,7 +29,7 @@ #include /* Definitions */ -#define MINR_VERSION "2.5.20" +#define MINR_VERSION "2.6.0" #define FILE_FILES 256 #define MAX_ARG_LEN 1024 #define MIN_FILE_REC_LEN 70 diff --git a/inc/minr_log.h b/inc/minr_log.h index 1e48c60..98da34c 100644 --- a/inc/minr_log.h +++ b/inc/minr_log.h @@ -2,7 +2,6 @@ #define __MINR_LOG_H #include -extern char log_file[FILENAME_MAX]; void minr_log(const char *fmt, ...); - +void minr_log_path(char * path); #endif \ No newline at end of file diff --git a/inc/mz.h b/inc/mz.h index 48090e2..2652d9d 100644 --- a/inc/mz.h +++ b/inc/mz.h @@ -14,12 +14,5 @@ typedef enum void mz_optimise(struct mz_job *job, mz_optimise_mode_t mode); void mz_extract(struct mz_job *job); -void mz_list(struct mz_job *job); - -#ifndef MZ_DEFLATE -#define MZ_DEFLATE_LOCAL -void mz_deflate2(struct mz_job *job); -#define MZ_DEFLATE(job) mz_deflate2(job) -#endif #endif \ No newline at end of file diff --git a/src/attributions.c b/src/attributions.c index 8afb37c..97ba163 100644 --- a/src/attributions.c +++ b/src/attributions.c @@ -239,7 +239,7 @@ void mine_attribution_notice(struct minr_job *job, char *path) /* Compress data */ job->zsrc_ln = compressBound(job->src_ln + 1); - job->zsrc =calloc((job->zsrc_ln + 1), 1); + job->zsrc =calloc((job->zsrc_ln + 128), 1); /* Save the first bytes of zsrc to accomodate the MZ header */ compress(job->zsrc + MZ_HEAD, &job->zsrc_ln, (uint8_t *)job->src, job->src_ln + 1); diff --git a/src/file.c b/src/file.c index 46e433b..565b41c 100644 --- a/src/file.c +++ b/src/file.c @@ -27,7 +27,6 @@ */ #include -#include #include #include #include diff --git a/src/import.c b/src/import.c index d2792b8..88c7e80 100644 --- a/src/import.c +++ b/src/import.c @@ -38,7 +38,6 @@ #include "file.h" #include "hex.h" #include "ignorelist.h" -#include "join.h" #include "minr_log.h" diff --git a/src/main.c b/src/main.c index 53d7361..43ed628 100644 --- a/src/main.c +++ b/src/main.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -261,7 +260,7 @@ int main(int argc, char *argv[]) strcpy(job.metadata, optarg); break; case 'V': - strcpy(log_file, optarg); + minr_log_path(optarg); break; case 's': job.skip_sort = true; diff --git a/src/md5.c b/src/md5.c deleted file mode 100644 index 3c20ed2..0000000 --- a/src/md5.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * src/md5.c - * - * MD5 calculation - * - * Copyright (C) 2018-2021 SCANOSS.COM - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or - * (at your option) any later version. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -/* Returns the hexadecimal md5 sum for "path" */ -#include "ldb.h" - -#ifndef MD5 -/** - * @file md5.c - * @date 7 Feb 2021 - * @brief Implement MD5 calculation - */ -#include - -#include "minr.h" -#include "md5.h" - -/** - * @brief Calculate the md5 sum of a file - * - * @param path string path - * @return uint8_t* pointer to md5 - */ -#define BUFFER_SIZE 1048576 - -uint8_t *file_md5 (char *path) -{ - uint8_t *c = calloc(16,1); - FILE *fp = fopen(path, "rb"); - MD5_CTX mdContext; - uint32_t bytes; - - if (fp != NULL) - { - uint8_t *buffer = malloc(BUFFER_SIZE); - MD5_Init (&mdContext); - - while ((bytes = fread(buffer, 1, BUFFER_SIZE, fp)) != 0) - MD5_Update(&mdContext, buffer, bytes); - - MD5_Final(c, &mdContext); - fclose(fp); - free(buffer); - } - return c; -} -/** - * @brief Calculate the md5 sum of a data set - * - * @param data pointer to data - * @return uint8_t* pointer to md5 - */ - -void calc_md5(char *data, int size, uint8_t *out) -{ - MD5_CTX mdContext; - MD5_Init (&mdContext); - MD5_Update(&mdContext, data, size); - MD5_Final(out, &mdContext); -} -#endif \ No newline at end of file diff --git a/src/minr.c b/src/minr.c index 8eb74a7..ffdd24d 100644 --- a/src/minr.c +++ b/src/minr.c @@ -33,11 +33,10 @@ #include #include "attributions.h" #include "file.h" -#include "md5.h" #include "hex.h" #include "ignorelist.h" #include "ignored_files.h" -#include "ldb.h" +#include #include "crypto.h" #include "minr_log.h" @@ -171,7 +170,7 @@ char *downloaded_file(char *tmp_dir) */ void load_urlid(struct minr_job *job, char *tmp_file) { - uint8_t *bin_md5 = file_md5(tmp_file); + uint8_t *bin_md5 = md5_file(tmp_file); char *hex_md5 = bin_to_hex(bin_md5, 16); strcpy(job->urlid, hex_md5); free(hex_md5); @@ -334,7 +333,7 @@ int load_file(struct minr_job *job, char *path) fclose(fp); /* Calculate file MD5 */ - uint8_t * md5 = file_md5(path); + uint8_t * md5 = md5_file(path); memcpy(job->md5, md5, sizeof(job->md5)); free(md5); ldb_bin_to_hex(job->md5, MD5_LEN, job->fileid); diff --git a/src/minr_log.c b/src/minr_log.c index c2c0362..b7dacb6 100644 --- a/src/minr_log.c +++ b/src/minr_log.c @@ -1,8 +1,14 @@ #include #include +#include #include "minr_log.h" -char log_file[FILENAME_MAX] = "\0"; +static char log_file[FILENAME_MAX] = "\0"; + +void minr_log_path(char * path) +{ + strcpy(log_file, path); +} /** * @brief Print the logs in stderr diff --git a/src/mz_deflate.c b/src/mz_deflate.c deleted file mode 100644 index 9d8ede5..0000000 --- a/src/mz_deflate.c +++ /dev/null @@ -1,330 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * src/mz_deflate.c - * - * MZ decompression, validation and listing functions - * - * Copyright (C) 2018-2021 SCANOSS.COM - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or - * (at your option) any later version. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -/** - * @file mz_deflate.c - * @date 26 Oct 2021 - * @brief - -#include -#include "mz.h" -#include "minr.h" -#include "md5.h" -#include "hex.h" -#include - - -#ifdef MZ_DEFLATE_LOCAL -/* This code is here to provide backward compatibility, this is duplicated in the newers versions of ldb*/ -#define CHUNK_SIZE 1024 - -int uncompress_by_chunks(uint8_t **data, uint8_t *zdata, size_t zdata_len) { - int ret; - z_stream strm; - unsigned char out[CHUNK_SIZE]; - size_t data_size = 0; // Current size of decompressed data - - // Initialize the z_stream structure - memset(&strm, 0, sizeof(strm)); - ret = inflateInit(&strm); - if (ret != Z_OK) { - fprintf(stderr, "inflateInit failed with error %d\n", ret); - exit(EXIT_FAILURE); - } - *data = malloc(CHUNK_SIZE); - // Process the compressed data - strm.avail_in = zdata_len; // Size of the compressed data - strm.next_in = zdata; - - do { - strm.avail_out = CHUNK_SIZE; - strm.next_out = out; - - ret = inflate(&strm, Z_NO_FLUSH); - if (ret == Z_STREAM_ERROR) { - fprintf(stderr, "inflate failed with error Z_STREAM_ERROR\n"); - inflateEnd(&strm); - mz_corrupted(); - } - - unsigned have = CHUNK_SIZE - strm.avail_out; - - // Realloc to increase the size of data - *data = realloc(*data, data_size + have); - if (*data == NULL) - { - fprintf(stderr, "Error reallocating memory to store decompressed data"); - inflateEnd(&strm); - exit(EXIT_FAILURE); - } - - // Copy the decompressed data to the end of data - memcpy(*data + data_size, out, have); - data_size += have; - } while (ret != Z_STREAM_END); - - // Free resources - inflateEnd(&strm); - return data_size; -} - -void mz_deflate2(struct mz_job *job) -{ - /* Decompress data */ - job->data_ln = uncompress_by_chunks((uint8_t **) &job->data, job->zdata, job->zdata_ln); - job->data_ln--; -} - -#endif - -/** - * @brief Compare two mz keys - * - * @param a key a - * @param b key b - * @return 1 if a > b. -1 if b < a, 0 if they are equals - */ -int mz_key_cmp(const void * a, const void * b) -{ - const uint8_t *va = a; - const uint8_t *vb = b; - - /* Compare byte by byte */ - for (int i = 0; i < MD5_LEN; i++) - { - if (va[i] > vb[i]) return 1; - if (va[i] < vb[i]) return -1; - } - - return 0; -} - -/** - * @brief Handling function for listing mz keys - * - * @param job pointer to mz job - * @return true - */ -bool mz_dump_keys_handler(struct mz_job *job) -{ - /* Fill MD5 with item id */ - mz_id_fill(job->md5, job->id); - - ldb_hex_to_bin(job->md5, MD5_LEN * 2, job->ptr + job->ptr_ln); - job->ptr_ln += MD5_LEN; - - return true; -} - -/** - * @brief Output unique mz keys to STDOUT (binary) - * - * @param job pointer to mz job - */ -void mz_dump_keys(struct mz_job *job) -{ - /* Use job->ptr to store keys */ - job->ptr = malloc(job->mz_ln); - - /* Fetch keys */ - mz_parse(job, mz_dump_keys_handler); - - /* Sort keys */ - qsort(job->ptr, job->ptr_ln / MD5_LEN, MD5_LEN, mz_key_cmp); - - /* Output keys */ - for (int i = 0; i < job->ptr_ln; i += 16) - { - bool skip = false; - if (i) if (!memcmp(job->ptr + i, job->ptr + i - MD5_LEN, MD5_LEN)) - { - skip = true; - } - if (!skip) fwrite(job->ptr + i, MD5_LEN, 1, stdout); - } - - free(job->ptr); -} - -/** - * @brief Handling function for listing mz contents - * - * @param job - * @return true - */ -bool mz_list_handler(struct mz_job *job) -{ - /* Fill MD5 with item id */ - mz_id_fill(job->md5, job->id); - - /* Decompress */ - MZ_DEFLATE(job); - - /* Calculate resulting data MD5 */ - uint8_t actual_md5[MD5_LEN]; - MD5((unsigned char*) job->data, job->data_ln, actual_md5); - - /* Compare data checksum to validate */ - char *actual = bin_to_hex(actual_md5, MD5_LEN); - - if (strcmp(job->md5, actual)) - { - printf("%s [NOK] %lu bytes\n", job->md5, job->data_ln); - } - else if (!job->check_only) - { - printf("%s [OK] %lu bytes\n", job->md5, job->data_ln); - } - free(job->data); - free(actual); - return true; -} - -/** - * @brief List the content of a mz file - * - * @param job pointer to mz job - */ -void mz_list(struct mz_job *job) -{ - /* Extract first two MD5 bytes from the file name */ - memcpy(job->md5, basename(job->path), 4); - - /* Read source mz file into memory */ - job->mz = file_read(job->path, &job->mz_ln); - - /* List mz contents */ - if (!job->dump_keys) mz_parse(job, mz_list_handler); - - /* Dump mz keys */ - else mz_dump_keys(job); - - free(job->mz); -} - -/** - * @brief Decompress and print a mz file (handler) - * - * @param job pointer to mz job - * @return true - * @return false - */ -bool mz_cat_handler(struct mz_job *job) -{ - if (!memcmp(job->id, job->key + 2, MZ_MD5)) - { - /* Decompress */ - MZ_DEFLATE(job); - - job->data[job->data_ln] = 0; - printf("%s", job->data); - free(job->data); - return false; - } - return true; -} - -/** - * @brief Decompress and print a mz file - * - * @param job pointer to mz job - * @param key key to be found - */ -void mz_cat(struct mz_job *job, char *key) -{ - /* Calculate mz file path */ - char mz_path[MAX_PATH_LEN] = "\0"; - char mz_file_id[5] = "\0\0\0\0\0"; - memcpy(mz_file_id, key, 4); - - sprintf(mz_path, "%s/%s.mz", job->path, mz_file_id); - - /* Save path and key on job */ - job->key = calloc(MD5_LEN, 1); - ldb_hex_to_bin(key, MD5_LEN * 2, job->key); - - /* Read source mz file into memory */ - job->mz = file_read(mz_path, &job->mz_ln); - - /* Search and display "key" file contents */ - mz_parse(job, mz_cat_handler); - - free(job->key); - free(job->mz); -} - -/** - * @brief Handler for mz extraction - * - * @param job pointer to mz job - * @return true - */ -bool mz_extract_handler(struct mz_job *job) -{ - /* Fill MD5 with item id */ - mz_id_fill(job->md5, job->id); - - /* Decompress */ - MZ_DEFLATE(job); - - /* Calculate resulting data MD5 */ - uint8_t actual_md5[MD5_LEN]; - MD5((unsigned char*) job->data, job->data_ln, actual_md5); - job->data[job->data_ln] = 0; - /* Compare data checksum to validate */ - char *actual = bin_to_hex(actual_md5, MD5_LEN); - /* Extract data to file */ - file_write(job->md5, (uint8_t *)job->data, job->data_ln); - printf("Extracting %s (%lu bytes)\n", job->md5, job->data_ln); - - if (strcmp(job->md5, actual)) - { - fprintf(stderr, "Warning uncompressed file MD5 does not match\n"); - } - - free(actual); - //free(job->mz); - free(job->data); - - return true; -} - -/** - * @brief Extract the content of a mz file - * - * @param job pointer to mz job - */ -void mz_extract(struct mz_job *job) -{ - /* Extract first two MD5 bytes from the file name */ - memcpy(job->md5, basename(job->path), 4); - - /* Read source mz file into memory */ - job->mz = file_read(job->path, &job->mz_ln); - - /* Launch extraction */ - mz_parse(job, mz_extract_handler); - free(job->mz); -} diff --git a/src/mz_main.c b/src/mz_main.c index b289835..edbcbc8 100644 --- a/src/mz_main.c +++ b/src/mz_main.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -206,7 +205,7 @@ int main(int argc, char *argv[]) case 'c': job.check_only = true; argcpy(job.path, optarg); - mz_list(&job); + mz_list_check(&job); break; case 'x': @@ -217,12 +216,12 @@ int main(int argc, char *argv[]) case 'K': job.dump_keys = true; argcpy(job.path, optarg); - mz_list(&job); + mz_list_check(&job); break; case 'l': argcpy(job.path, optarg); - mz_list(&job); + mz_list_check(&job); break; case 'C': diff --git a/src/mz_optimise.c b/src/mz_optimise.c index ea6e489..ad59ca1 100644 --- a/src/mz_optimise.c +++ b/src/mz_optimise.c @@ -108,17 +108,9 @@ bool mz_md5_match(uint8_t *mz1, uint8_t *mz2) * @param job pointer to mz job * @return true */ -bool mz_optimise_handler(struct mz_job *job) +static bool mz_optimise_handler(struct mz_job *job) { /* Uncompress */ - /*uint64_t src_ln = MAX_FILE_SIZE; - if (Z_OK != uncompress((uint8_t *)job->data, &src_ln, job->zdata, job->zdata_ln)) - { - printf("[DECOMPRESS FAILED] "); - ldb_hexprint(job->id, 14, 14); - return true; - } - job->data_ln = src_ln - 1;*/ MZ_DEFLATE(job); job->data[job->data_ln] = 0; @@ -176,17 +168,9 @@ bool mz_optimise_handler(struct mz_job *job) return true; } -bool mz_optimise_dup_handler(struct mz_job *job) +static bool mz_optimise_dup_handler(struct mz_job *job) { /* Uncompress */ -/* uint64_t src_ln = MAX_FILE_SIZE; - if (Z_OK != uncompress((uint8_t *)job->data, &src_ln, job->zdata, job->zdata_ln)) - { - printf("[DECOMPRESS FAILED] "); - ldb_hexprint(job->id, 14, 14); - return true; - } - job->data_ln = src_ln - 1;*/ MZ_DEFLATE(job); job->data[job->data_ln] = 0; diff --git a/src/url.c b/src/url.c index 5584046..b6d5ef5 100644 --- a/src/url.c +++ b/src/url.c @@ -27,7 +27,6 @@ */ #include -#include #include #include #include diff --git a/src/wfp.c b/src/wfp.c index da2cd1b..f2728d7 100644 --- a/src/wfp.c +++ b/src/wfp.c @@ -33,7 +33,6 @@ #include #include "ldb.h" #include "minr.h" -#include "md5.h" #include "ignorelist.h" #include "winnowing.h" #include "hex.h"