Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Files health check with checksum match count #7380

Merged
merged 3 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 74 additions & 21 deletions app/api/files.v2/FilesHealthCheck.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import { FilesDataSource } from './contracts/FilesDataSource';
import { FileStorage } from './contracts/FileStorage';
import { StoredFile } from './model/StoredFile';
import { URLAttachment } from './model/URLAttachment';

function filterFilesInStorage(files: string[]) {
function filterFilesInStorage(files: StoredFile[]) {
return files.filter(
file =>
!file.includes('/log/') && !file.includes('/segmentation/') && !file.includes('index.html')
!file.fullPath.includes('/log/') &&
!file.fullPath.includes('/segmentation/') &&
!file.fullPath.includes('index.html')
);
}

/**
 * Payload passed to the `onMissingInDB` callback for a stored file that was
 * left unmatched by any database file during the health check.
 */
type missingInDBFileDTO = {
// Full storage path of the file (populated from `StoredFile.fullPath`, not the base name).
filename: string;
// Number of listed storage files sharing this file's checksum, the file itself included.
checksumMatchCount: number;
};

export class FilesHealthCheck {
// eslint-disable-next-line class-methods-use-this
private onMissingInDBCB: (filename: string) => void = () => {};
private onMissingInDBCB: (file: missingInDBFileDTO) => void = () => {};

// eslint-disable-next-line class-methods-use-this
private onMissingInStorageCB: (fileDTO: { _id: string; filename: string }) => void = () => {};
Expand All @@ -26,42 +33,88 @@ export class FilesHealthCheck {
}

async execute() {
const allFilesInDb = await this.filesDS.getAll().all();
const allFilesInStorage = await this.fileStorage.list();
const filteredFilesInStorage = new Set(filterFilesInStorage(allFilesInStorage));
let missingInStorage = 0;
const { dbFiles, storageFiles, filesChecksumMatchCounts, storageFilesIndexedByPath } =
await this.getFilesData();
const countInStorage = storageFiles.length;
const missingInStorageList: string[] = [];
const missingInDbList: string[] = [];
const countInStorage = filteredFilesInStorage.size;
let countInDb = 0;

allFilesInDb.forEach(file => {
countInDb += 1;
const existsInStorage = filteredFilesInStorage.delete(this.fileStorage.getPath(file));
const counters = {
missingInStorage: 0,
missingInDbWithChecksumMatches: 0,
countInDb: 0,
};

dbFiles.forEach(file => {
counters.countInDb += 1;

const existsInStorage = storageFilesIndexedByPath[this.fileStorage.getPath(file)];

if (existsInStorage) {
delete storageFilesIndexedByPath[this.fileStorage.getPath(file)];
}

if (!existsInStorage && !(file instanceof URLAttachment)) {
missingInStorage += 1;
counters.missingInStorage += 1;
missingInStorageList.push(this.fileStorage.getPath(file));
this.onMissingInStorageCB({ _id: file.id, filename: file.filename });
}
});

filteredFilesInStorage.forEach(file => {
missingInDbList.push(file);
this.onMissingInDBCB(file);
const storageFilesRemaining = Object.values(storageFilesIndexedByPath);

storageFilesRemaining.forEach(storedFile => {
missingInDbList.push(storedFile.fullPath);
const checksumMatchCount = filesChecksumMatchCounts[storedFile.checksum || ''];
if (checksumMatchCount > 1) {
counters.missingInDbWithChecksumMatches += 1;
}
this.onMissingInDBCB({ filename: storedFile.fullPath, checksumMatchCount });
});

return {
missingInStorageList,
missingInStorage,
missingInDbList,
missingInDb: filteredFilesInStorage.size,
countInDb,
missingInDb: storageFilesRemaining.length,
countInStorage,
...counters,
};
}

/**
 * Gathers the data the health check works on.
 *
 * @returns
 *  - `storageFiles`: the storage listing after `filterFilesInStorage` is applied.
 *  - `storageFilesIndexedByPath`: the same files keyed by `fullPath`.
 *  - `filesChecksumMatchCounts`: for each checksum, how many storage files carry it.
 *    Files without a checksum are represented by the `''` key, pinned at 1.
 *  - `dbFiles`: every file returned by `filesDS.getAll().all()`.
 */
async getFilesData() {
  const allFilesInStorage = await this.fileStorage.list();
  const storageFiles = filterFilesInStorage(allFilesInStorage);

  const filesChecksumMatchCounts: { [k: string]: number } = {};
  const storageFilesIndexedByPath: { [k: string]: StoredFile } = {};

  storageFiles.forEach(storedFile => {
    storageFilesIndexedByPath[storedFile.fullPath] = storedFile;

    if (!storedFile.checksum) {
      // An unknown checksum cannot be compared, so such a file only "matches"
      // itself. Keeping the '' bucket at 1 (instead of counting every
      // checksum-less file) stops them from being reported as duplicates of
      // one another, while lookups done with `checksum || ''` still get a number.
      filesChecksumMatchCounts[''] = 1;
      return;
    }

    filesChecksumMatchCounts[storedFile.checksum] =
      (filesChecksumMatchCounts[storedFile.checksum] || 0) + 1;
  });

  const dbFiles = await this.filesDS.getAll().all();

  return {
    storageFilesIndexedByPath,
    storageFiles,
    filesChecksumMatchCounts,
    dbFiles,
  };
}

onMissingInDB(cb: (filename: string) => void) {
onMissingInDB(cb: (file: missingInDBFileDTO) => void) {
this.onMissingInDBCB = cb;
}

Expand Down
3 changes: 2 additions & 1 deletion app/api/files.v2/contracts/FileStorage.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { StoredFile } from '../model/StoredFile';
import { UwaziFile } from '../model/UwaziFile';

export interface FileStorage {
list(): Promise<string[]>;
list(): Promise<StoredFile[]>;
getPath(file: UwaziFile): string;
}
5 changes: 3 additions & 2 deletions app/api/files.v2/infrastructure/S3FileStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { Attachment } from '../model/Attachment';
import { UwaziFile } from '../model/UwaziFile';
import { URLAttachment } from '../model/URLAttachment';
import { CustomUpload } from '../model/CustomUpload';
import { StoredFile } from '../model/StoredFile';

export class S3FileStorage implements FileStorage {
private s3Client: S3Client;
Expand All @@ -31,7 +32,7 @@ export class S3FileStorage implements FileStorage {
return path.join(this.tenant.uploadedDocuments, file.filename);
}

async list(): Promise<string[]> {
async list() {
const objects: _Object[] = [];
const requestNext = async (token?: string) => {
const response = await this.s3Client.send(
Expand All @@ -52,6 +53,6 @@ export class S3FileStorage implements FileStorage {
continuationToken = await requestNext(continuationToken);
}

return objects.map(c => c.Key!);
return objects.map(c => new StoredFile(c.Key!, c.ETag!));
}
}
4 changes: 2 additions & 2 deletions app/api/files.v2/infrastructure/specs/S3FileStorage.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ describe('S3FileStorage', () => {

const listedFiles = await s3fileStorage.list();

expect(listedFiles.sort()).toEqual(
expect(listedFiles.map(f => f.fullPath).sort()).toEqual(
['test-tenant/documents/document1', 'test-tenant/documents/document2'].sort()
);
});
Expand All @@ -125,7 +125,7 @@ describe('S3FileStorage', () => {

const listedFiles = await s3fileStorage.list();

expect(listedFiles.sort()).toEqual(
expect(listedFiles.map(f => f.fullPath).sort()).toEqual(
['test-tenant/documents/document1', 'test-tenant/documents/document2'].sort()
);
});
Expand Down
15 changes: 15 additions & 0 deletions app/api/files.v2/model/StoredFile.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import path from 'path';

/**
 * Read-only description of a single object listed from file storage.
 */
export class StoredFile {
  /** Last path segment of `fullPath`, as produced by `path.basename`. */
  readonly filename: string;

  /** Path of the object exactly as it was passed to the constructor. */
  readonly fullPath: string;

  /** Checksum supplied by the caller; `undefined` when none was given. */
  readonly checksum?: string;

  /**
   * @param fullPath - complete path of the stored object.
   * @param checksum - optional checksum associated with the object.
   */
  constructor(fullPath: string, checksum?: string) {
    const baseName = path.basename(fullPath);

    this.filename = baseName;
    this.fullPath = fullPath;
    this.checksum = checksum;
  }
}
112 changes: 84 additions & 28 deletions app/api/files.v2/specs/FilesHealthCheck.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { DefaultFilesDataSource } from '../database/data_source_defaults';
import { UwaziFile } from '../model/UwaziFile';
import { URLAttachment } from '../model/URLAttachment';
import { CustomUpload } from '../model/CustomUpload';
import { StoredFile } from '../model/StoredFile';

const factory = getFixturesFactory();

Expand All @@ -18,7 +19,7 @@ afterAll(async () => {
await testingEnvironment.tearDown();
});

let testStorageFiles: string[] = [];
let testStorageFiles: StoredFile[] = [];
class TestFileStorage implements FileStorage {
// eslint-disable-next-line class-methods-use-this
getPath(file: UwaziFile): string {
Expand All @@ -32,14 +33,14 @@ class TestFileStorage implements FileStorage {
}

// eslint-disable-next-line class-methods-use-this
async list(): Promise<string[]> {
async list() {
return testStorageFiles;
}
}

describe('FilesHealthCheck', () => {
let filesHealthCheck: FilesHealthCheck;
let filesHealthCheck: FilesHealthCheck;

describe('FilesHealthCheck', () => {
beforeEach(() => {
filesHealthCheck = new FilesHealthCheck(
new TestFileStorage(),
Expand All @@ -48,7 +49,7 @@ describe('FilesHealthCheck', () => {
});

it('should report full count in storage and in db', async () => {
testStorageFiles = ['document/file1', 'document/file3'];
testStorageFiles = [new StoredFile('document/file1'), new StoredFile('document/file3')];
await testingEnvironment.setUp({
files: [factory.document('file1'), factory.document('file2'), factory.document('file4')],
});
Expand All @@ -62,7 +63,11 @@ describe('FilesHealthCheck', () => {
});

it('should report missing in storage files', async () => {
testStorageFiles = ['document/file1', 'document/file3', 'custom_uploads/custom1'];
testStorageFiles = [
new StoredFile('document/file1'),
new StoredFile('document/file3'),
new StoredFile('custom_uploads/custom1'),
];
await testingEnvironment.setUp({
files: [
factory.document('file1'),
Expand All @@ -82,7 +87,12 @@ describe('FilesHealthCheck', () => {
});

it('should report missing in DB files', async () => {
testStorageFiles = ['document/file1', 'document/file2', 'document/file3', 'document/file4'];
testStorageFiles = [
new StoredFile('document/file1'),
new StoredFile('document/file2'),
new StoredFile('document/file3'),
new StoredFile('document/file4'),
];
await testingEnvironment.setUp({
files: [factory.document('file2'), factory.document('file3')],
});
Expand All @@ -97,11 +107,11 @@ describe('FilesHealthCheck', () => {

it('should ignore all /log files', async () => {
testStorageFiles = [
'/log/1-activity.log',
'/log/log.log',
'/log/error.log',
'/log/debug.log',
'/document/file1',
new StoredFile('/log/1-activity.log'),
new StoredFile('/log/log.log'),
new StoredFile('/log/error.log'),
new StoredFile('/log/debug.log'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -115,9 +125,9 @@ describe('FilesHealthCheck', () => {

it('should ignore all /segmentation files', async () => {
testStorageFiles = [
'/segmentation/1-activity.log',
'/documents/segmentation/1-activity.log',
'/document/file1',
new StoredFile('/segmentation/1-activity.log'),
new StoredFile('/documents/segmentation/1-activity.log'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -131,10 +141,10 @@ describe('FilesHealthCheck', () => {

it('should ignore all index.html files', async () => {
testStorageFiles = [
'/documents/index.html',
'/index.html',
'/segmentation/index.html',
'/document/file1',
new StoredFile('/documents/index.html'),
new StoredFile('/index.html'),
new StoredFile('/segmentation/index.html'),
new StoredFile('/document/file1'),
];
await testingEnvironment.setUp({ files: [] });

Expand All @@ -147,7 +157,7 @@ describe('FilesHealthCheck', () => {
});

it('should ignore external attachemnts (have url)', async () => {
testStorageFiles = ['document/file1'];
testStorageFiles = [new StoredFile('document/file1')];
await testingEnvironment.setUp({ files: [factory.attachment('url_file', { url: 'url' })] });

const summary = await filesHealthCheck.execute();
Expand All @@ -158,17 +168,63 @@ describe('FilesHealthCheck', () => {
});
});

it('should be able to subscribe to an "event" for each file missing in db', async () => {
testStorageFiles = ['document/file1', 'document/file2'];
await testingEnvironment.setUp({ files: [] });
describe('onMissingInDB', () => {
it('should emit each file that is missing', async () => {
testStorageFiles = [new StoredFile('document/file1'), new StoredFile('document/file2')];
await testingEnvironment.setUp({ files: [] });

const events: string[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
const events: { filename: string }[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
});

await filesHealthCheck.execute();
expect(events.map(e => e.filename)).toEqual(['document/file1', 'document/file2']);
});

await filesHealthCheck.execute();
expect(events).toEqual(['document/file1', 'document/file2']);
it('should emit the count of duplicated checksums for each file emited', async () => {
testStorageFiles = [
new StoredFile('document/file1', 'checksum1'),
new StoredFile('document/file2', 'checksum2'),
new StoredFile('document/file3'),
new StoredFile('document/file4', 'checksum1'),
new StoredFile('document/file5', 'checksum1'),
new StoredFile('document/file6', 'checksum2'),
];
await testingEnvironment.setUp({ files: [] });

const events: { filename: string }[] = [];
filesHealthCheck.onMissingInDB(file => {
events.push(file);
});

await filesHealthCheck.execute();
expect(events).toMatchObject([
{ filename: 'document/file1', checksumMatchCount: 3 },
{ filename: 'document/file2', checksumMatchCount: 2 },
{ filename: 'document/file3', checksumMatchCount: 1 },
{ filename: 'document/file4', checksumMatchCount: 3 },
{ filename: 'document/file5', checksumMatchCount: 3 },
{ filename: 'document/file6', checksumMatchCount: 2 },
]);
});

it('should emit in the summary the number of files which have a checksum match count > 1', async () => {
testStorageFiles = [
new StoredFile('document/file1', 'checksum1'),
new StoredFile('document/file2', 'checksum2'),
new StoredFile('document/file3'),
new StoredFile('document/file4', 'checksum1'),
new StoredFile('document/file5', 'checksum1'),
new StoredFile('document/file6', 'checksum2'),
];
await testingEnvironment.setUp({ files: [] });

const summary = await filesHealthCheck.execute();
expect(summary).toMatchObject({
missingInDbWithChecksumMatches: 5,
});
});
});

it('should be able to subscribe to an "event" for each file missing in storage', async () => {
Expand Down
Loading
Loading