Skip to content

Commit

Permalink
Files health check with checksum match count (#7380)
Browse files Browse the repository at this point in the history
* WIP, return checksum match counts

these counts represent how many files are exactly the same

* fix lint errors and improve performance
  • Loading branch information
daneryl authored Oct 18, 2024
1 parent d78c406 commit ee9b9c5
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 55 deletions.
95 changes: 74 additions & 21 deletions app/api/files.v2/FilesHealthCheck.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import { FilesDataSource } from './contracts/FilesDataSource';
import { FileStorage } from './contracts/FileStorage';
import { StoredFile } from './model/StoredFile';
import { URLAttachment } from './model/URLAttachment';

function filterFilesInStorage(files: string[]) {
function filterFilesInStorage(files: StoredFile[]) {
return files.filter(
file =>
!file.includes('/log/') && !file.includes('/segmentation/') && !file.includes('index.html')
!file.fullPath.includes('/log/') &&
!file.fullPath.includes('/segmentation/') &&
!file.fullPath.includes('index.html')
);
}

/**
 * Payload passed to the callback registered through `onMissingInDB`, one per
 * storage file that was not matched to any DB file during `execute()`.
 */
type missingInDBFileDTO = {
// Filled from the stored file's `fullPath` (the whole storage path, not just the basename).
filename: string;
// Number of filtered storage files sharing this file's checksum, the file itself included;
// a value greater than 1 means at least one other stored file has the same checksum.
checksumMatchCount: number;
};

export class FilesHealthCheck {
// eslint-disable-next-line class-methods-use-this
private onMissingInDBCB: (filename: string) => void = () => {};
private onMissingInDBCB: (file: missingInDBFileDTO) => void = () => {};

// eslint-disable-next-line class-methods-use-this
private onMissingInStorageCB: (fileDTO: { _id: string; filename: string }) => void = () => {};
Expand All @@ -26,42 +33,88 @@ export class FilesHealthCheck {
}

async execute() {
const allFilesInDb = await this.filesDS.getAll().all();
const allFilesInStorage = await this.fileStorage.list();
const filteredFilesInStorage = new Set(filterFilesInStorage(allFilesInStorage));
let missingInStorage = 0;
const { dbFiles, storageFiles, filesChecksumMatchCounts, storageFilesIndexedByPath } =
await this.getFilesData();
const countInStorage = storageFiles.length;
const missingInStorageList: string[] = [];
const missingInDbList: string[] = [];
const countInStorage = filteredFilesInStorage.size;
let countInDb = 0;

allFilesInDb.forEach(file => {
countInDb += 1;
const existsInStorage = filteredFilesInStorage.delete(this.fileStorage.getPath(file));
const counters = {
missingInStorage: 0,
missingInDbWithChecksumMatches: 0,
countInDb: 0,
};

dbFiles.forEach(file => {
counters.countInDb += 1;

const existsInStorage = storageFilesIndexedByPath[this.fileStorage.getPath(file)];

if (existsInStorage) {
delete storageFilesIndexedByPath[this.fileStorage.getPath(file)];
}

if (!existsInStorage && !(file instanceof URLAttachment)) {
missingInStorage += 1;
counters.missingInStorage += 1;
missingInStorageList.push(this.fileStorage.getPath(file));
this.onMissingInStorageCB({ _id: file.id, filename: file.filename });
}
});

filteredFilesInStorage.forEach(file => {
missingInDbList.push(file);
this.onMissingInDBCB(file);
const storageFilesRemaining = Object.values(storageFilesIndexedByPath);

storageFilesRemaining.forEach(storedFile => {
missingInDbList.push(storedFile.fullPath);
const checksumMatchCount = filesChecksumMatchCounts[storedFile.checksum || ''];
if (checksumMatchCount > 1) {
counters.missingInDbWithChecksumMatches += 1;
}
this.onMissingInDBCB({ filename: storedFile.fullPath, checksumMatchCount });
});

return {
missingInStorageList,
missingInStorage,
missingInDbList,
missingInDb: filteredFilesInStorage.size,
countInDb,
missingInDb: storageFilesRemaining.length,
countInStorage,
...counters,
};
}

/**
 * Gathers everything `execute()` needs to compare storage against the DB.
 *
 * @returns
 *  - `storageFiles`: the storage listing after `filterFilesInStorage`.
 *  - `storageFilesIndexedByPath`: the same files keyed by `fullPath`
 *    (if two entries share a path, the later one wins).
 *  - `filesChecksumMatchCounts`: for each checksum, how many filtered storage
 *    files carry it. Files without a checksum are all counted under the
 *    empty-string key `''`.
 *  - `dbFiles`: every file returned by `filesDS.getAll().all()`.
 */
async getFilesData() {
  const allFilesInStorage = await this.fileStorage.list();
  const storageFiles = filterFilesInStorage(allFilesInStorage);

  // Null-prototype dictionaries: keys come from external data (storage paths
  // and checksums), so a key such as 'constructor' or 'toString' must not
  // resolve to a member inherited from Object.prototype. With a plain `{}`
  // such a key would look "already counted" / "already in storage".
  const filesChecksumMatchCounts: { [k: string]: number } = Object.create(null);
  const storageFilesIndexedByPath: { [k: string]: StoredFile } = Object.create(null);

  // Single pass over the listing builds both lookups.
  storageFiles.forEach(storedFile => {
    // Same fallback key that execute() uses when it looks the count up.
    const checksum = storedFile.checksum || '';
    filesChecksumMatchCounts[checksum] = (filesChecksumMatchCounts[checksum] || 0) + 1;
    storageFilesIndexedByPath[storedFile.fullPath] = storedFile;
  });

  const dbFiles = await this.filesDS.getAll().all();

  return {
    storageFilesIndexedByPath,
    storageFiles,
    filesChecksumMatchCounts,
    dbFiles,
  };
}

onMissingInDB(cb: (filename: string) => void) {
onMissingInDB(cb: (file: missingInDBFileDTO) => void) {
this.onMissingInDBCB = cb;
}

Expand Down
3 changes: 2 additions & 1 deletion app/api/files.v2/contracts/FileStorage.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { StoredFile } from '../model/StoredFile';
import { UwaziFile } from '../model/UwaziFile';

export interface FileStorage {
list(): Promise<string[]>;
list(): Promise<StoredFile[]>;
getPath(file: UwaziFile): string;
}
5 changes: 3 additions & 2 deletions app/api/files.v2/infrastructure/S3FileStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { Attachment } from '../model/Attachment';
import { UwaziFile } from '../model/UwaziFile';
import { URLAttachment } from '../model/URLAttachment';
import { CustomUpload } from '../model/CustomUpload';
import { StoredFile } from '../model/StoredFile';

export class S3FileStorage implements FileStorage {
private s3Client: S3Client;
Expand All @@ -31,7 +32,7 @@ export class S3FileStorage implements FileStorage {
return path.join(this.tenant.uploadedDocuments, file.filename);
}

async list(): Promise<string[]> {
async list() {
const objects: _Object[] = [];
const requestNext = async (token?: string) => {
const response = await this.s3Client.send(
Expand All @@ -52,6 +53,6 @@ export class S3FileStorage implements FileStorage {
continuationToken = await requestNext(continuationToken);
}

return objects.map(c => c.Key!);
return objects.map(c => new StoredFile(c.Key!, c.ETag!));
}
}
4 changes: 2 additions & 2 deletions app/api/files.v2/infrastructure/specs/S3FileStorage.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ describe('S3FileStorage', () => {

const listedFiles = await s3fileStorage.list();

expect(listedFiles.sort()).toEqual(
expect(listedFiles.map(f => f.fullPath).sort()).toEqual(
['test-tenant/documents/document1', 'test-tenant/documents/document2'].sort()
);
});
Expand All @@ -125,7 +125,7 @@ describe('S3FileStorage', () => {

const listedFiles = await s3fileStorage.list();

expect(listedFiles.sort()).toEqual(
expect(listedFiles.map(f => f.fullPath).sort()).toEqual(
['test-tenant/documents/document1', 'test-tenant/documents/document2'].sort()
);
});
Expand Down
15 changes: 15 additions & 0 deletions app/api/files.v2/model/StoredFile.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import path from 'path';

/**
 * Read-only description of a single entry in file storage: where it lives,
 * its base name, and (when known) a checksum of its content.
 */
export class StoredFile {
  /** Last segment of `fullPath`, as computed by `path.basename`. */
  readonly filename: string;

  /** Storage path exactly as it was given to the constructor. */
  readonly fullPath: string;

  /** Checksum supplied by whoever created the instance; `undefined` when none was given. */
  readonly checksum?: string;

  /**
   * @param fullPath - complete path of the file inside the storage.
   * @param checksum - optional checksum of the file content.
   */
  constructor(fullPath: string, checksum?: string) {
    const baseName = path.basename(fullPath);

    // Assignment order is kept (filename, fullPath, checksum) so the
    // enumeration order of the instance's own properties stays the same.
    this.filename = baseName;
    this.fullPath = fullPath;
    this.checksum = checksum;
  }
}
112 changes: 84 additions & 28 deletions app/api/files.v2/specs/FilesHealthCheck.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { DefaultFilesDataSource } from '../database/data_source_defaults';
import { UwaziFile } from '../model/UwaziFile';
import { URLAttachment } from '../model/URLAttachment';
import { CustomUpload } from '../model/CustomUpload';
import { StoredFile } from '../model/StoredFile';

const factory = getFixturesFactory();

Expand All @@ -18,7 +19,7 @@ afterAll(async () => {
await testingEnvironment.tearDown();
});

let testStorageFiles: string[] = [];
let testStorageFiles: StoredFile[] = [];
class TestFileStorage implements FileStorage {
// eslint-disable-next-line class-methods-use-this
getPath(file: UwaziFile): string {
Expand All @@ -32,14 +33,14 @@ class TestFileStorage implements FileStorage {
}

// eslint-disable-next-line class-methods-use-this
async list(): Promise<string[]> {
async list() {
return testStorageFiles;
}
}

describe('FilesHealthCheck', () => {
let filesHealthCheck: FilesHealthCheck;
let filesHealthCheck: FilesHealthCheck;

describe('FilesHealthCheck', () => {
beforeEach(() => {
filesHealthCheck = new FilesHealthCheck(
new TestFileStorage(),
Expand All @@ -48,7 +49,7 @@ describe('FilesHealthCheck', () => {
});

it('should report full count in storage and in db', async () => {
testStorageFiles = ['document/file1', 'document/file3'];
testStorageFiles = [new StoredFile('document/file1'), new StoredFile('document/file3')];
await testingEnvironment.setUp({
files: [factory.document('file1'), factory.document('file2'), factory.document('file4')],
});
Expand All @@ -62,7 +63,11 @@ describe('FilesHealthCheck', () => {
});

it('should report missing in storage files', async () => {
testStorageFiles = ['document/file1', 'document/file3', 'custom_uploads/custom1'];
testStorageFiles = [
new StoredFile('document/file1'),
new StoredFile('document/file3'),
new StoredFile('custom_uploads/custom1'),
];
await testingEnvironment.setUp({
files: [
factory.document('file1'),
Expand All @@ -82,7 +87,12 @@ describe('FilesHealthCheck', () => {
});

it('should report missing in DB files', async () => {
testStorageFiles = ['document/file1', 'document/file2', 'document/file3', 'document/file4'];
testStorageFiles = [
new StoredFile('document/file1'),
new StoredFile('document/file2'),
new StoredFile('document/file3'),
new StoredFile('document/file4'),
];
await testingEnvironment.setUp({
files: [factory.document('file2'), factory.document('file3')],
});
Expand All @@ -97,11 +107,11 @@ describe('FilesHealthCheck', () => {

it('should ignore all /log files', async () => {
testStorageFiles = [
'/log/1-activity.log',
'/log/log.log',
'/log/error.log',
'/log/debug.log',
'/document/file1',
new StoredFile('/log/1-activity.log'),
new StoredFile('/log/log.log'),
new StoredFile('/log/error.log'),
new StoredFile('/log/debug.log'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -115,9 +125,9 @@ describe('FilesHealthCheck', () => {

it('should ignore all /segmentation files', async () => {
testStorageFiles = [
'/segmentation/1-activity.log',
'/documents/segmentation/1-activity.log',
'/document/file1',
new StoredFile('/segmentation/1-activity.log'),
new StoredFile('/documents/segmentation/1-activity.log'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -131,10 +141,10 @@ describe('FilesHealthCheck', () => {

it('should ignore all index.html files', async () => {
testStorageFiles = [
'/documents/index.html',
'/index.html',
'/segmentation/index.html',
'/document/file1',
new StoredFile('/documents/index.html'),
new StoredFile('/index.html'),
new StoredFile('/segmentation/index.html'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -147,7 +157,7 @@ describe('FilesHealthCheck', () => {
});

it('should ignore external attachemnts (have url)', async () => {
testStorageFiles = ['document/file1'];
testStorageFiles = [new StoredFile('document/file1')];
await testingEnvironment.setUp({ files: [factory.attachment('url_file', { url: 'url' })] });

const summary = await filesHealthCheck.execute();
Expand All @@ -158,17 +168,63 @@ describe('FilesHealthCheck', () => {
});
});

it('should be able to subscribe to an "event" for each file missing in db', async () => {
testStorageFiles = ['document/file1', 'document/file2'];
await testingEnvironment.setUp({ files: [] });
describe('onMissingInDB', () => {
it('should emit each file that is missing', async () => {
testStorageFiles = [new StoredFile('document/file1'), new StoredFile('document/file2')];
await testingEnvironment.setUp({ files: [] });

const events: string[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
const events: { filename: string }[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
});

await filesHealthCheck.execute();
expect(events.map(e => e.filename)).toEqual(['document/file1', 'document/file2']);
});

await filesHealthCheck.execute();
expect(events).toEqual(['document/file1', 'document/file2']);
it('should emit the count of duplicated checksums for each file emited', async () => {
testStorageFiles = [
new StoredFile('document/file1', 'checksum1'),
new StoredFile('document/file2', 'checksum2'),
new StoredFile('document/file3'),
new StoredFile('document/file4', 'checksum1'),
new StoredFile('document/file5', 'checksum1'),
new StoredFile('document/file6', 'checksum2'),
];
await testingEnvironment.setUp({ files: [] });

const events: { filename: string }[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
});

await filesHealthCheck.execute();
expect(events).toMatchObject([
{ filename: 'document/file1', checksumMatchCount: 3 },
{ filename: 'document/file2', checksumMatchCount: 2 },
{ filename: 'document/file3', checksumMatchCount: 1 },
{ filename: 'document/file4', checksumMatchCount: 3 },
{ filename: 'document/file5', checksumMatchCount: 3 },
{ filename: 'document/file6', checksumMatchCount: 2 },
]);
});

it('should emit in the summary the number of files which have a checksum match count > 1', async () => {
testStorageFiles = [
new StoredFile('document/file1', 'checksum1'),
new StoredFile('document/file2', 'checksum2'),
new StoredFile('document/file3'),
new StoredFile('document/file4', 'checksum1'),
new StoredFile('document/file5', 'checksum1'),
new StoredFile('document/file6', 'checksum2'),
];
await testingEnvironment.setUp({ files: [] });

const summary = await filesHealthCheck.execute();
expect(summary).toMatchObject({
missingInDbWithChecksumMatches: 5,
});
});
});

it('should be able to subscribe to an "event" for each file missing in storage', async () => {
Expand Down
Loading

0 comments on commit ee9b9c5

Please sign in to comment.