Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3242 slow csv import with thesaurus creation #3927

Merged
merged 26 commits into from
Sep 29, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7e0e44b
first implementation
LaszloKecskes Sep 20, 2021
51e7136
added extra test
LaszloKecskes Sep 20, 2021
0de48ce
updated select typeparser
LaszloKecskes Sep 20, 2021
a99bf66
updated multiselect typeparser and failing tests
LaszloKecskes Sep 21, 2021
baa367e
code cleanup
LaszloKecskes Sep 21, 2021
9f5a68e
added error handling
LaszloKecskes Sep 21, 2021
4d01edd
fix code climate issues
LaszloKecskes Sep 21, 2021
b43cfb3
further code cleanup
LaszloKecskes Sep 21, 2021
33ecda7
Merge branch 'development' into 3242_slow_csv_import_with_thesaurus_c…
LaszloKecskes Sep 21, 2021
8cf7de1
removing unneccessary async
LaszloKecskes Sep 21, 2021
1ab2637
removed Readable option from importFile
LaszloKecskes Sep 22, 2021
a6cef0f
corrected failing unit tests
LaszloKecskes Sep 22, 2021
38d36f0
Merge branch '3242_slow_csv_import_with_thesaurus_creation' of https:…
LaszloKecskes Sep 22, 2021
8ae1ef1
added smoke test
LaszloKecskes Sep 22, 2021
af075d5
handling languages
LaszloKecskes Sep 22, 2021
ee3eb2c
updated test with languages
LaszloKecskes Sep 22, 2021
a4bc6c3
corrected missing translation problem
LaszloKecskes Sep 23, 2021
c5f6842
removing eslint line
LaszloKecskes Sep 24, 2021
6e0e903
changing thesauri database query to single
LaszloKecskes Sep 24, 2021
23c12cd
changed error handling on arrangeThesauri
LaszloKecskes Sep 24, 2021
c5f5b1f
refactored arrangeThesauri
LaszloKecskes Sep 24, 2021
ef998da
updated tests
LaszloKecskes Sep 24, 2021
b42630d
changing a test description
LaszloKecskes Sep 24, 2021
4cae862
removed select-multiselect differentiation
LaszloKecskes Sep 28, 2021
ab7986a
removed error catch, added typing, refactored functions
LaszloKecskes Sep 28, 2021
34898b2
Merge branch 'development' into 3242_slow_csv_import_with_thesaurus_c…
daneryl Sep 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions app/api/csv/arrangeThesauri.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import { ImportFile } from 'api/csv/importFile';
import thesauri from 'api/thesauri';
import { propertyTypes } from 'shared/propertyTypes';
import { PropertySchema } from 'shared/types/commonTypes';
import { TemplateSchema } from 'shared/types/templateType';
import { ThesaurusSchema } from 'shared/types/thesaurusType';

import csv, { CSVRow } from './csv';
import { splitMultiselectLabels } from './typeParsers/multiselect';
import { normalizeThesaurusLabel } from './typeParsers/select';

/**
 * Returns a new object holding only the entries of `input` whose key is listed in `keys`.
 *
 * Only own properties of `input` are copied; keys that `input` does not own are skipped.
 * `input` itself is not modified.
 *
 * @param input - Object to pick entries from.
 * @param keys - Names of the entries to keep.
 * @returns A fresh object with the selected entries.
 */
const filterJSObject = (input: { [k: string]: any }, keys: string[]): { [k: string]: any } => {
  const result: { [k: string]: any } = {};
  keys.forEach(k => {
    // Call through Object.prototype so the check keeps working when `input` has no
    // prototype or owns an entry that is itself named "hasOwnProperty".
    if (Object.prototype.hasOwnProperty.call(input, k)) {
      result[k] = input[k];
    }
  });
  return result;
};

/**
 * Error raised while reading the CSV to arrange thesauri.
 *
 * It carries the original error together with the row and the row index that were being
 * processed, so a caller can report the failure against that row.
 */
class ArrangeThesauriError extends Error {
  /** The error originally reported for the row. */
  source: Error;

  /** The CSV row that was being processed when the error happened. */
  row: CSVRow;

  /** Index of `row` as reported by the CSV reader. */
  index: number;

  constructor(source: Error, row: CSVRow, index: number) {
    super(source.message);
    // Subclasses of built-ins lose their prototype when TypeScript targets ES5;
    // restoring it keeps `instanceof ArrangeThesauriError` reliable for callers.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = 'ArrangeThesauriError';
    this.source = source;
    this.row = row;
    this.index = index;
  }
}

/**
 * Builds two lookups from column name to thesaurus id: one for select properties and
 * one for multiselect properties.
 *
 * Every property contributes its own name plus one `<name>__<language>` entry per item
 * in `languages`. Properties lacking `content` or `type`, and properties of any other
 * type, are left out.
 *
 * @param thesauriRelatedProperties - Template properties to inspect; may be undefined.
 * @param languages - Optional suffixes used to build the per-language column names.
 * @returns A tuple `[selectLookup, multiselectLookup]`.
 */
const separateSelectAndMultiselectThesauri = (
  thesauriRelatedProperties: PropertySchema[] | undefined,
  languages?: string[]
): [{ [k: string]: string }, { [k: string]: string }] => {
  const selectColumns: { [k: string]: string } = {};
  const multiselectColumns: { [k: string]: string } = {};
  const suffixes = languages || [];

  (thesauriRelatedProperties || []).forEach(property => {
    if (!property.content || !property.type) {
      return;
    }

    const thesaurusId = property.content.toString();
    // Plain column first, then one column per language suffix.
    const columnNames = [property.name].concat(
      suffixes.map(suffix => `${property.name}__${suffix}`)
    );

    if (property.type === propertyTypes.select) {
      columnNames.forEach(column => {
        selectColumns[column] = thesaurusId;
      });
    } else if (property.type === propertyTypes.multiselect) {
      columnNames.forEach(column => {
        multiselectColumns[column] = thesaurusId;
      });
    }
  });

  return [selectColumns, multiselectColumns];
};

/**
 * Label bookkeeping used while scanning the CSV. Every map is keyed by thesaurus id.
 */
type ThesauriValueData = {
// Normalized labels of the values each thesaurus already holds.
thesauriIdToExistingValues: Map<string, Set<string>>;
// Labels found in the CSV that are missing from the thesaurus, kept as written in the file.
thesauriIdToNewValues: Map<string, Set<string>>;
// Normalized form of the labels queued above; checked so that a label is queued only once.
thesauriIdToNormalizedNewValues: Map<string, Set<string>>;
};

/**
 * Creates the per-thesaurus label sets used while scanning the CSV.
 *
 * For every thesaurus that has an `_id`, the existing-values set is filled with the
 * normalized labels of its current values, and two empty sets are registered for the
 * new labels (original and normalized form). Thesauri without an `_id` are skipped.
 *
 * @param allRelatedThesauri - Thesauri referenced by the template being imported.
 * @returns The three id-keyed maps bundled as `ThesauriValueData`.
 */
const setupIdValueMaps = (allRelatedThesauri: ThesaurusSchema[]): ThesauriValueData => {
  const thesauriIdToExistingValues = new Map();
  const thesauriIdToNewValues = new Map();
  const thesauriIdToNormalizedNewValues = new Map();

  allRelatedThesauri.forEach(t => {
    if (t._id) {
      const id = t._id.toString();
      thesauriIdToExistingValues.set(
        id,
        new Set(t.values?.map(v => normalizeThesaurusLabel(v.label)))
      );
      thesauriIdToNewValues.set(id, new Set());
      thesauriIdToNormalizedNewValues.set(id, new Set());
    }
  });

  return { thesauriIdToExistingValues, thesauriIdToNewValues, thesauriIdToNormalizedNewValues };
};

/**
 * Queues a label as a new value for the thesaurus `id` unless it is already known.
 *
 * Nothing happens when `normalized` is empty or null, or when the normalized label is
 * already among the existing values or the previously queued ones.
 *
 * @param id - Id of the thesaurus the label belongs to.
 * @param original - Label as it should be stored in the thesaurus.
 * @param normalized - Normalized form of the label, used for the duplicate checks.
 * @param thesauriValueData - Bookkeeping maps; the two new-value maps are updated in place.
 */
const handleLabels = (
  id: string,
  original: string,
  normalized: string | null,
  thesauriValueData: ThesauriValueData
) => {
  if (!normalized) {
    return;
  }

  const {
    thesauriIdToExistingValues,
    thesauriIdToNewValues,
    thesauriIdToNormalizedNewValues,
  } = thesauriValueData;

  if (thesauriIdToExistingValues.get(id)?.has(normalized)) {
    return;
  }
  if (thesauriIdToNormalizedNewValues.get(id)?.has(normalized)) {
    return;
  }

  thesauriIdToNewValues.get(id)?.add(original);
  thesauriIdToNormalizedNewValues.get(id)?.add(normalized);
};

/**
 * Saves, one after another, every thesaurus that gained new values.
 *
 * For each thesaurus with an `_id`, the labels queued under that id are appended to its
 * current values and the result is passed to `thesauri.save`. Thesauri with no queued
 * labels, or without an `_id`, are not saved.
 *
 * @param allRelatedThesauri - Thesauri to update.
 * @param thesauriIdToNewValues - Labels to add, keyed by thesaurus id.
 */
const syncSaveThesauri = async (
  allRelatedThesauri: ThesaurusSchema[],
  thesauriIdToNewValues: Map<string, Set<string>>
) => {
  for (let i = 0; i < allRelatedThesauri.length; i += 1) {
    const thesaurus = allRelatedThesauri[i];
    if (thesaurus?._id) {
      const newValues: { label: string }[] = Array.from(
        thesauriIdToNewValues.get(thesaurus._id.toString()) || []
      ).map(tval => ({ label: tval }));
      if (newValues.length > 0) {
        const thesaurusValues = thesaurus.values || [];
        // Each save is awaited before the next one starts.
        // eslint-disable-next-line no-await-in-loop
        await thesauri.save({
          ...thesaurus,
          values: thesaurusValues.concat(newValues),
        });
      }
    }
  }
};

/**
 * Reads the CSV once and adds to the related thesauri every select/multiselect label
 * that they do not contain yet.
 *
 * Steps:
 *  1. Collect the select and multiselect properties of the template and map their column
 *     names (including `<name>__<language>` variants) to thesaurus ids.
 *  2. Load all referenced thesauri with a single `thesauri.get` call.
 *  3. Scan every CSV row, queueing the labels that are neither stored nor queued already.
 *  4. Save the thesauri that gained values.
 *
 * @param file - Import file providing the CSV stream.
 * @param template - Template whose properties decide which columns are inspected.
 * @param languages - Language keys used to recognise the per-language columns.
 * @param stopOnError - Forwarded unchanged to the CSV reader.
 * @throws ArrangeThesauriError from the reader's error handler, wrapping the original
 *   error with the offending row and its index.
 */
const arrangeThesauri = async (
  file: ImportFile,
  template: TemplateSchema,
  languages?: string[],
  stopOnError: boolean = true
) => {
  const thesauriRelatedProperties = template.properties?.filter(p =>
    ['select', 'multiselect'].includes(p.type)
  );

  let [
    nameToThesauriIdSelects,
    nameToThesauriIdMultiselects,
  ] = separateSelectAndMultiselectThesauri(thesauriRelatedProperties, languages);

  // Distinct, non-empty thesaurus ids referenced by the properties above.
  const allRelatedThesauri = await thesauri.get({
    $in: Array.from(
      new Set(thesauriRelatedProperties?.map(p => p.content?.toString()).filter(t => t))
    ),
  });

  const thesauriValueData = setupIdValueMaps(allRelatedThesauri);

  await csv(await file.readStream(), stopOnError)
    .onRow(async (row: CSVRow, index: number) => {
      if (index === 0) {
        // Narrow both lookups to the columns present in the first row, so the
        // remaining rows only visit columns the file actually has.
        const columnnames = Object.keys(row);
        nameToThesauriIdSelects = filterJSObject(nameToThesauriIdSelects, columnnames);
        nameToThesauriIdMultiselects = filterJSObject(nameToThesauriIdMultiselects, columnnames);
      }
      Object.entries(nameToThesauriIdSelects).forEach(([name, id]) => {
        const label = row[name];
        if (label) {
          const normalizedLabel = normalizeThesaurusLabel(label);
          handleLabels(id, label, normalizedLabel, thesauriValueData);
        }
      });
      Object.entries(nameToThesauriIdMultiselects).forEach(([name, id]) => {
        const labels = splitMultiselectLabels(row[name]);
        if (labels) {
          // Entries are consumed as [normalized label, original label] pairs.
          Object.entries(labels).forEach(([normalizedLabel, originalLabel]) => {
            handleLabels(id, originalLabel, normalizedLabel, thesauriValueData);
          });
        }
      });
    })
    .onError(async (e: Error, row: CSVRow, index: number) => {
      throw new ArrangeThesauriError(e, row, index);
    })
    .read();

  await syncSaveThesauri(allRelatedThesauri, thesauriValueData.thesauriIdToNewValues);
};

export { arrangeThesauri, ArrangeThesauriError };
15 changes: 12 additions & 3 deletions app/api/csv/csvLoader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
/* eslint-disable max-statements */
import { EventEmitter } from 'events';

import templates from 'api/templates';
Expand All @@ -11,9 +10,10 @@ import { ThesaurusSchema } from 'shared/types/thesaurusType';

import { ensure } from 'shared/tsUtils';
import { ObjectId } from 'mongodb';
import { arrangeThesauri, ArrangeThesauriError } from './arrangeThesauri';
import csv, { CSVRow } from './csv';
import importFile from './importFile';
import { arrangeThesauri, importEntity, translateEntity } from './importEntity';
import { importEntity, translateEntity } from './importEntity';
import { extractEntity, toSafeName } from './entityRow';

export class CSVLoader extends EventEmitter {
Expand Down Expand Up @@ -56,7 +56,16 @@ export class CSVLoader extends EventEmitter {
(await settings.get()).languages
).map((l: LanguageSchema) => l.key);
const { newNameGeneration = false } = await settings.get();
await arrangeThesauri(file, template, availableLanguages, this);
try {
await arrangeThesauri(file, template, availableLanguages, this.stopOnError);
} catch (e) {
if (e instanceof ArrangeThesauriError) {
const _e: ArrangeThesauriError = e;
this.emit('loadError', _e.source, toSafeName(_e.row), _e.index);
} else {
throw e;
}
}
daneryl marked this conversation as resolved.
Show resolved Hide resolved

await csv(await file.readStream(), this.stopOnError)
.onRow(async (row: CSVRow) => {
Expand Down
123 changes: 2 additions & 121 deletions app/api/csv/importEntity.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
/* eslint-disable max-statements */
import entities from 'api/entities';
import { search } from 'api/search';
import entitiesModel from 'api/entities/entitiesModel';
import { processDocument } from 'api/files/processDocument';
import { RawEntity, toSafeName } from 'api/csv/entityRow';
import { RawEntity } from 'api/csv/entityRow';
import { TemplateSchema } from 'shared/types/templateType';
import { MetadataSchema, PropertySchema } from 'shared/types/commonTypes';
import { propertyTypes } from 'shared/propertyTypes';
import { ImportFile } from 'api/csv/importFile';
import thesauri from 'api/thesauri';
import { EntitySchema } from 'shared/types/entityType';
import { ensure } from 'shared/tsUtils';
import { attachmentsPath, files } from 'api/files';
import { generateID } from 'shared/IDGenerator';

import { normalizeThesaurusLabel } from './typeParsers/select';
import { splitMultiselectLabels } from './typeParsers/multiselect';
import typeParsers from './typeParsers';
import csv, { CSVRow } from './csv';

const parse = async (toImportEntity: RawEntity, prop: PropertySchema) =>
typeParsers[prop.type]
Expand Down Expand Up @@ -71,120 +66,6 @@ type Options = {
language: string;
};

/**
 * Returns a new object holding only the entries of `input` whose key is listed in `keys`.
 *
 * Only own properties of `input` are copied; keys that `input` does not own are skipped.
 * `input` itself is not modified.
 *
 * @param input - Object to pick entries from.
 * @param keys - Names of the entries to keep.
 * @returns A fresh object with the selected entries.
 */
const filterJSObject = (input: { [k: string]: any }, keys: string[]): { [k: string]: any } => {
  const result: { [k: string]: any } = {};
  keys.forEach(k => {
    // Call through Object.prototype so the check keeps working when `input` has no
    // prototype or owns an entry that is itself named "hasOwnProperty".
    if (Object.prototype.hasOwnProperty.call(input, k)) {
      result[k] = input[k];
    }
  });
  return result;
};

/**
 * Scans the CSV file and appends to the related thesauri every select/multiselect label
 * they do not contain yet.
 *
 * @param file - Import file providing the CSV stream.
 * @param template - Template whose select/multiselect properties are inspected.
 * @param languages - Suffixes used to build the `<name>__<language>` column names.
 * @param errorContext - Optional object; its `stopOnError` is passed to the CSV reader,
 *   and row errors are stored in its `_errors` and emitted through it as 'loadError'.
 */
const arrangeThesauri = async (
file: ImportFile,
template: TemplateSchema,
languages?: string[],
errorContext?: any
) => {
// Column name -> thesaurus id, kept separately for selects and multiselects.
let nameToThesauriIdSelects: { [k: string]: string } = {};
let nameToThesauriIdMultiselects: { [k: string]: string } = {};
// Per-thesaurus label sets: normalized existing labels, new labels as written,
// and the normalized form of the new labels.
const thesauriIdToExistingValues = new Map();
const thesauriIdToNewValues: Map<string, Set<string>> = new Map();
const thesauriIdToNormalizedNewValues = new Map();
const thesauriRelatedProperties = template.properties?.filter(p =>
['select', 'multiselect'].includes(p.type)
);
// Register each property under its plain name and under every language-suffixed name.
thesauriRelatedProperties?.forEach(p => {
if (p.content && p.type) {
const thesarusID = p.content.toString();
if (p.type === propertyTypes.select) {
nameToThesauriIdSelects[p.name] = thesarusID;
languages?.forEach(suffix => {
nameToThesauriIdSelects[`${p.name}__${suffix}`] = thesarusID;
});
} else if (p.type === propertyTypes.multiselect) {
nameToThesauriIdMultiselects[p.name] = thesarusID;
languages?.forEach(suffix => {
nameToThesauriIdMultiselects[`${p.name}__${suffix}`] = thesarusID;
});
}
}
});
// One getById call per distinct, non-empty thesaurus id.
const allRelatedThesauri = await Promise.all(
Array.from(
new Set(thesauriRelatedProperties?.map(p => p.content?.toString()).filter(t => t))
).map(async id => thesauri.getById(id))
);
// Seed the label sets for every thesaurus that was found.
allRelatedThesauri.forEach(t => {
if (t) {
const id = t._id.toString();
thesauriIdToExistingValues.set(
id,
new Set(t.values?.map(v => normalizeThesaurusLabel(v.label)))
);
thesauriIdToNewValues.set(id, new Set());
thesauriIdToNormalizedNewValues.set(id, new Set());
}
});
// Queues `original` for thesaurus `id` unless its normalized form is empty,
// already stored, or already queued.
function handleLabels(id: string, original: string, normalized: string | null) {
if (
normalized &&
!thesauriIdToExistingValues.get(id).has(normalized) &&
!thesauriIdToNormalizedNewValues.get(id).has(normalized)
) {
thesauriIdToNewValues.get(id)?.add(original);
thesauriIdToNormalizedNewValues.get(id).add(normalized);
}
}
await csv(await file.readStream(), errorContext?.stopOnError)
.onRow(async (row: CSVRow, index: number) => {
if (index === 0) {
// Narrow both lookups to the columns present in the first row.
const columnnames = Object.keys(row);
nameToThesauriIdSelects = filterJSObject(nameToThesauriIdSelects, columnnames);
nameToThesauriIdMultiselects = filterJSObject(nameToThesauriIdMultiselects, columnnames);
}
Object.entries(nameToThesauriIdSelects).forEach(([name, id]) => {
const label = row[name];
if (label) {
const normalizedLabel = normalizeThesaurusLabel(label);
handleLabels(id, label, normalizedLabel);
}
});
Object.entries(nameToThesauriIdMultiselects).forEach(([name, id]) => {
const labels = splitMultiselectLabels(row[name]);
if (labels) {
// Entries are consumed as [normalized label, original label] pairs.
Object.entries(labels).forEach(([normalizedLabel, originalLabel]) => {
handleLabels(id, originalLabel, normalizedLabel);
});
}
});
})
.onError(async (e: Error, row: CSVRow, index: number) => {
// Without an errorContext the error is dropped here.
if (errorContext) {
errorContext._errors[index] = e;
errorContext.emit('loadError', e, toSafeName(row), index);
}
})
.read();
// Save, one at a time, each thesaurus that gained at least one value.
for (let i = 0; i < allRelatedThesauri.length; i += 1) {
const thesaurus = allRelatedThesauri[i];
if (thesaurus !== null) {
const newValues: { label: string }[] = Array.from(
thesauriIdToNewValues.get(thesaurus._id.toString()) || []
).map(tval => ({ label: tval }));
if (newValues.length > 0) {
const thesaurusValues = thesaurus.values || [];
// eslint-disable-next-line no-await-in-loop
await thesauri.save({
...thesaurus,
values: thesaurusValues.concat(newValues),
});
}
}
}
};

const importEntity = async (
toImportEntity: RawEntity,
template: TemplateSchema,
Expand Down Expand Up @@ -241,4 +122,4 @@ const translateEntity = async (
await search.indexEntities({ sharedId: entity.sharedId }, '+fullText');
};

export { arrangeThesauri, importEntity, translateEntity };
export { importEntity, translateEntity };
16 changes: 0 additions & 16 deletions app/api/csv/specs/csvLoader.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -169,22 +169,6 @@ describe('csvLoader', () => {
expect(textValues.length).toEqual(0);
});

// For every translation returned by translations.get(), the contexts labelled
// 'thesauri1' and 'multi_select_thesaurus' must contain the listed keys, each mapped
// to itself.
it('should arrange translations for selects and multiselects', async () => {
const trs = await translations.get();
trs.forEach(tr => {
expect(tr.contexts.find(c => c.label === 'thesauri1').values).toMatchObject({
thesauri1: 'thesauri1',
thesauri2: 'thesauri2',
});
expect(tr.contexts.find(c => c.label === 'multi_select_thesaurus').values).toMatchObject({
multi_select_thesaurus: 'multi_select_thesaurus',
multivalue1: 'multivalue1',
multivalue2: 'multivalue2',
multivalue3: 'multivalue3',
});
});
});

describe('metadata parsing', () => {
it('should parse metadata properties by type using typeParsers', () => {
const textValues = imported.map(i => i.metadata.text_label[0].value);
Expand Down
Loading