Skip to content

Commit

Permalink
add lines limit to wfp calculation (#49)
Browse files Browse the repository at this point in the history
Co-authored-by: scanossmining <info@scanoss.com>
  • Loading branch information
mscasso-scanoss and scanoss-cs authored Nov 24, 2023
1 parent d437b8b commit c9122d3
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
6 changes: 5 additions & 1 deletion external/src/winnowing.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
uint8_t GRAM = 30; // Winnowing gram size in bytes
uint8_t WINDOW = 64; // Winnowing window size in bytes
uint32_t MAX_UINT32 = 4294967295;
int MAX_FILE_SIZE = 4 * 1048576;

/* Convert case to lowercase, and return zero if it isn't a letter or number
Do it fast and independent from the locale configuration (avoid string.h) */
Expand Down Expand Up @@ -99,6 +98,11 @@ uint32_t winnowing(char *src, uint32_t *hashes, uint32_t *lines, uint32_t limit,
{
if (*src == '\n') line++;

if (line > 16384)
{
break;
}

uint8_t byte = normalize(*(src++));
if (!byte) continue;

Expand Down
1 change: 0 additions & 1 deletion inc/minr.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#define MAX_FILE_HEADER 4096
#define MAX_HEADER_LINES 30
#define MAX_CSV_LINE_LEN 1024
#define BUFFER_SIZE 1048576
#define MIN_FILE_SIZE 256 // files below this size will be ignored
#define DISCARD_PATH_IF_LONGER_THAN 1024
#define MD5_LEN 16
Expand Down
2 changes: 2 additions & 0 deletions src/md5.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
* @param path string path
* @return uint8_t* pointer to md5
*/
#define BUFFER_SIZE 1048576

uint8_t *file_md5 (char *path)
{
uint8_t *c = calloc(16,1);
Expand Down
20 changes: 10 additions & 10 deletions src/wfp.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ void wfp_free(void)
free(out_snippet);
}

#define WFP_BUFFER_SIZE 1048576

/**
* @brief Extract wfp from a source
*
Expand Down Expand Up @@ -91,7 +93,7 @@ void extract_wfp(uint8_t *md5, char *src, uint32_t length, bool check_mz)

uint8_t *grams = calloc(mem_alloc,1);
uint32_t *windows = calloc (mem_alloc*4,1);
uint8_t *buffer = malloc(BUFFER_SIZE * 256);
uint8_t *buffer = malloc(WFP_BUFFER_SIZE * 256);
uint32_t *hashes = malloc(mem_alloc);
uint32_t *lines = malloc(mem_alloc);

Expand All @@ -107,38 +109,37 @@ void extract_wfp(uint8_t *md5, char *src, uint32_t length, bool check_mz)
uint32_reverse((uint8_t *)&hashes[i]);
n = *(uint8_t *)(&hashes[i]);

memcpy(buffer + (BUFFER_SIZE * n) + buffer_ln[n], (char *) &hashes[i] + 1, 3);
memcpy(buffer + (WFP_BUFFER_SIZE * n) + buffer_ln[n], (char *) &hashes[i] + 1, 3);
buffer_ln[n] += 3;

/* Copy md5 hash (16 bytes) */
memcpy(buffer + (BUFFER_SIZE * n) + buffer_ln[n], (char *) md5, 16);
memcpy(buffer + (WFP_BUFFER_SIZE * n) + buffer_ln[n], (char *) md5, 16);
buffer_ln[n] += 16;

/* Copy originating line number */
line = (lines[i] > 65535) ? 65535 : lines[i];
memcpy(buffer + (BUFFER_SIZE * n) + buffer_ln[n], (char *)&line, 2);
memcpy(buffer + (WFP_BUFFER_SIZE * n) + buffer_ln[n], (char *)&line, 2);
buffer_ln[n] += 2;

/* Flush buffer */
if (buffer_ln[n] + 21 >= BUFFER_SIZE)
if (buffer_ln[n] + 21 >= WFP_BUFFER_SIZE)
{
if (!write(out_snippet[n], buffer + (n * BUFFER_SIZE), buffer_ln[n]))
if (!write(out_snippet[n], buffer + (n * WFP_BUFFER_SIZE), buffer_ln[n]))
printf("Warning: error writing snippet sector\n");
buffer_ln[n] = 0;
}
}

/* Flush buffer */
for (int i = 0; i < 256; i++)
if (buffer_ln[i]) if (!write(out_snippet[i], buffer + (BUFFER_SIZE * i), buffer_ln[i]))
if (buffer_ln[i]) if (!write(out_snippet[i], buffer + (WFP_BUFFER_SIZE * i), buffer_ln[i]))
printf("Warning: error writing snippet sector\n");

free (grams);
free (windows);
free (buffer);
free (hashes);
free (lines);

free (grams);
}

/**
Expand All @@ -155,7 +156,6 @@ bool mz_wfp_extract_handler(struct mz_job *job)
/* Decompress */
MZ_DEFLATE(job);
job->data[job->data_ln] = 0;

extract_wfp(job->ptr, job->data, job->data_ln, true);
free(job->data);

Expand Down

0 comments on commit c9122d3

Please sign in to comment.