Skip to content

Commit

Permalink
[Automatic Import] Better recognize (ND)JSON formats and send samples…
Browse files Browse the repository at this point in the history
…Format to the backend (#190588)

## Summary

This adds a `samplesFormat` group to the API. This group is filled out
by the frontend when parsing the provided samples and used to set the
log parsing specification for the produced integration.

We check this parameter to add a toggle that enables support for multiline
newline-delimited JSON in the filestream input.

## Release note

Automatic Import now supports the 'multiline newline-delimited JSON' log
sample format for the Filestream input.

## Detailed Explanation

We add the optional `samplesFormat` group to the API, consisting of 
 - `name`, 
 - (optional) `multiline`, 
 - and (optional) `json_path`.

Example values of this parameter:

- `{ name: 'ndjson', multiline: false }` for a newline-delimited JSON,
known as [NDJSON](https://github.com/ndjson/ndjson-spec) (where each
entry only takes one line)
- `{ name: 'ndjson', multiline: true }` for newline-delimited JSON where
each entry can span multiple lines
- `{ name: 'json', json_path: [] }` for valid JSON with the structure
`[{"key": "message1"}, {"key": "message2"}]`
- `{ name: 'json', json_path: ['events'] }` for valid JSON with the
structure `{"events": [{"key": "message1"}, {"key": "message2"}]}`

The `json_path` parameter is only relevant for `name: 'json'` and refers
to the path in the original JSON to the array representing the events to
ingest. Currently only one level is recognized (for example, `['events']`).

Not all combinations of a log format with input type will work; more
supported combinations as well as better user feedback on unsupported
combinations will come later (see
elastic/security-team#10290).

In this PR we add support for the multiline NDJSON format for the
`filestream` input type. This support comes in the form of a
user-changeable toggle under "Advanced Settings" that will be set to on
in cases where we detect the multiline NDJSON format.

---------

Co-authored-by: Marius Iversen <marius.iversen@elastic.co>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 22, 2024
1 parent a2873c0 commit 2a8b6d0
Show file tree
Hide file tree
Showing 19 changed files with 350 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export const testIntegration: Integration = {
},
],
},
samplesFormat: { name: 'ndjson', multiline: false },
},
],
};
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ export const ecsTestState = {
missingKeys: [],
invalidEcsFields: [],
results: { test: 'testresults' },
logFormat: 'testlogformat',
samplesFormat: 'testsamplesFormat',
ecsVersion: 'testversion',
currentMapping: { test1: 'test1' },
lastExecutedChain: 'testchain',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const getDataStreamMock = (): DataStream => ({
],
rawSamples,
pipeline: getPipelineMock(),
samplesFormat: { name: 'ndjson', multiline: false },
});

export const getIntegrationMock = (): Integration => ({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,30 @@ components:
items:
type: object

SamplesFormatName:
type: string
description: The name of the log samples format.
enum:
- ndjson
- json

SamplesFormat:
type: object
description: Format of the provided log samples.
required:
- name
properties:
name:
$ref: "#/components/schemas/SamplesFormatName"
multiline:
type: boolean
description: For some formats, specifies whether the samples can be multiline.
json_path:
type: array
description: For a JSON format, describes how to get to the sample array from the root of the JSON.
items:
type: string

Pipeline:
type: object
description: The pipeline object.
Expand Down Expand Up @@ -92,6 +116,7 @@ components:
- rawSamples
- pipeline
- docs
- samplesFormat
properties:
name:
type: string
Expand All @@ -116,6 +141,9 @@ components:
docs:
$ref: "#/components/schemas/Docs"
description: The documents of the dataStream.
samplesFormat:
$ref: "#/components/schemas/SamplesFormat"
description: The format of log samples in this dataStream.

Integration:
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,30 @@ export const Connector = z.string();
export type Docs = z.infer<typeof Docs>;
export const Docs = z.array(z.object({}).passthrough());

/**
* The name of the log samples format.
*
* Allowed values are `ndjson` (newline-delimited JSON) and `json`.
*/
export type SamplesFormatName = z.infer<typeof SamplesFormatName>;
export const SamplesFormatName = z.enum(['ndjson', 'json']);
// Companion type/value pair exposing the allowed names through zod's `.enum` accessor,
// so callers can reference a format name by property instead of a string literal.
export type SamplesFormatNameEnum = typeof SamplesFormatName.enum;
export const SamplesFormatNameEnum = SamplesFormatName.enum;

/**
* Format of the provided log samples.
*
* `name` is required; `multiline` and `json_path` are optional.
* Example values: `{ name: 'ndjson', multiline: false }`, `{ name: 'json', json_path: ['events'] }`.
*/
export type SamplesFormat = z.infer<typeof SamplesFormat>;
export const SamplesFormat = z.object({
// One of the names accepted by `SamplesFormatName`.
name: SamplesFormatName,
/**
* For some formats, specifies whether the samples can be multiline.
*/
multiline: z.boolean().optional(),
/**
* For a JSON format, describes how to get to the sample array from the root of the JSON.
* An empty array means the root itself is the array of samples.
*/
json_path: z.array(z.string()).optional(),
});

/**
* The pipeline object.
*/
Expand Down Expand Up @@ -128,6 +152,10 @@ export const DataStream = z.object({
* The documents of the dataStream.
*/
docs: Docs,
/**
* The format of log samples in this dataStream.
*/
samplesFormat: SamplesFormat,
});

/**
Expand Down Expand Up @@ -163,11 +191,11 @@ export const Integration = z.object({
export type LangSmithOptions = z.infer<typeof LangSmithOptions>;
export const LangSmithOptions = z.object({
/**
* The project name to use with tracing.
* The project name.
*/
projectName: z.string(),
/**
* The api key for the project
* The apiKey to use for tracing.
*/
apiKey: z.string(),
});
1 change: 1 addition & 0 deletions x-pack/plugins/integration_assistant/common/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export type {
Integration,
Pipeline,
Docs,
SamplesFormat,
} from './api/model/common_attributes';
export type { ESProcessorItem } from './api/model/processor_attributes';

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ export const mockState: State = {
dataStreamDescription: 'Mocked Data Stream Description',
inputTypes: ['filestream'],
logsSampleParsed: rawSamples,
samplesFormat: { name: 'ndjson', multiline: false },
},
isGenerating: false,
result,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import React from 'react';
import { act, fireEvent, render, waitFor, type RenderResult } from '@testing-library/react';
import { TestProvider } from '../../../../../mocks/test_provider';
import { SampleLogsInput } from './sample_logs_input';
import { parseNDJSON, parseJSONArray, SampleLogsInput } from './sample_logs_input';
import { ActionsProvider } from '../../state';
import { mockActions } from '../../mocks/state';
import { mockServices } from '../../../../../services/mocks/services';
Expand All @@ -27,6 +27,119 @@ const changeFile = async (input: HTMLElement, file: File) => {
});
};

// Shared fixtures for the parser and component tests below.

// Two JSON objects, one per line.
const simpleNDJSON = `{"message":"test message 1"}\n{"message":"test message 2"}`;
// The same two records, but the second one spans several lines and blank lines surround the entries.
const multilineNDJSON = `{"message":"test message 1"}\n\n{\n "message":\n "test message 2"\n}\n\n`;
// The individual lines of `simpleNDJSON`, i.e. one serialized record per element.
const splitNDJSON = simpleNDJSON.split('\n');
// A JSON object whose `events` key holds the array of records.
const complexEventsJSON = `{"events":[\n{"message":"test message 1"},\n{"message":"test message 2"}\n]}`;
// Same shape as above, but the key (`1event`) starts with a digit and so is not a valid identifier.
const nonIdentifierLikeKeyInJSON = `{"1event":[\n{"message":"test message 1"},\n{"message":"test message 2"}\n]}`;

/**
 * Unit tests for `parseNDJSON(fileContent, multiline)`.
 *
 * The second argument selects single-line mode (`false`, one record per line)
 * or multiline mode (`true`, a record may span several lines).
 */
describe('parseNDJSON', () => {
  const content = [{ message: 'test message 1' }, { message: 'test message 2' }];
  // The second record is preceded by extra spaces. The whitespace is written with explicit
  // escapes rather than inside a template literal so that re-indenting or trimming the source
  // cannot silently remove it; without it this fixture is identical to `simpleNDJSON` and the
  // "extra spaces" expectations below would contradict the "valid NDJSON" ones.
  // NOTE(review): the exact number of spaces is assumed; any non-zero amount exercises the case.
  const validNDJSONWithSpaces = '{"message":"test message 1"}\n    {"message":"test message 2"}';
  const singlelineArray = '[{"message":"test message 1"}, {"message":"test message 2"}]';
  const multilineArray = '[{"message":"test message 1"},\n{"message":"test message 2"}]';

  it('should parse valid NDJSON', () => {
    expect(parseNDJSON(simpleNDJSON, false)).toEqual(content);
    expect(parseNDJSON(simpleNDJSON, true)).toEqual(content);
  });

  it('should parse valid NDJSON with extra spaces in single-line mode', () => {
    expect(parseNDJSON(validNDJSONWithSpaces, false)).toEqual(content);
  });

  it('should not parse valid NDJSON with extra spaces in multiline mode', () => {
    expect(() => parseNDJSON(validNDJSONWithSpaces, true)).toThrow();
  });

  it('should not parse multiline NDJSON in single-line mode', () => {
    expect(() => parseNDJSON(multilineNDJSON, false)).toThrow();
  });

  it('should parse multiline NDJSON in multiline mode', () => {
    expect(parseNDJSON(multilineNDJSON, true)).toEqual(content);
  });

  it('should parse single-line JSON Array', () => {
    // A one-line array is a single NDJSON record, hence the extra level of nesting.
    expect(parseNDJSON(singlelineArray, false)).toEqual([content]);
    expect(parseNDJSON(singlelineArray, true)).toEqual([content]);
  });

  it('should not parse a multi-line JSON Array', () => {
    expect(() => parseNDJSON(multilineArray, false)).toThrow();
    expect(() => parseNDJSON(multilineArray, true)).toThrow();
  });

  it('should parse single-line JSON with one entry', () => {
    const fileContent = '{"message":"test message 1"}';
    // Called without the second argument on purpose: exercises the default mode.
    expect(parseNDJSON(fileContent)).toEqual([{ message: 'test message 1' }]);
  });

  it('should handle empty content', () => {
    expect(parseNDJSON(' ', false)).toEqual([]);
    expect(parseNDJSON(' ', true)).toEqual([]);
  });

  it('should handle empty lines in file content', () => {
    const fileContent = '\n\n{"message":"test message 1"}\n\n{"message":"test message 2"}\n\n';
    expect(parseNDJSON(fileContent, false)).toEqual(content);
    expect(parseNDJSON(fileContent, true)).toEqual(content);
  });
});

/**
 * Unit tests for `parseJSONArray(fileContent)`.
 *
 * The expectations below treat the result as an object with `entries`,
 * `pathToEntries` and `errorNoArrayFound` fields.
 */
describe('parseJSONArray', () => {
  // Records every successful parse in this suite is expected to produce.
  const expectedEntries = [{ message: 'test message 1' }, { message: 'test message 2' }];

  // Builds the expected result for a parse that located the array at the given path.
  const foundAt = (pathToEntries: string[]) => ({
    entries: expectedEntries,
    pathToEntries,
    errorNoArrayFound: false,
  });

  it('should parse valid JSON array', () => {
    // Same array serialized on one line, across two lines, and with irregular whitespace.
    const arrayInputs = [
      '[{"message":"test message 1"},{"message":"test message 2"}]',
      '[{"message":"test message 1"},\n{"message":"test message 2"}]',
      ' [ \n\n{"message": "test message 1"},\n{"message" :\n\n"test message 2"}\n]\n',
    ];

    for (const arrayInput of arrayInputs) {
      expect(parseJSONArray(arrayInput)).toEqual(foundAt([]));
    }
  });

  it('should parse valid JSON object with array entries', () => {
    expect(parseJSONArray(complexEventsJSON)).toEqual(foundAt(['events']));
  });

  it('should pass even if the JSON object with array entries has not an identifier-like key', () => {
    expect(parseJSONArray(nonIdentifierLikeKeyInJSON)).toEqual(foundAt(['1event']));
  });

  it('should return error for JSON that does not contain an array', () => {
    const objectWithoutArray = '{"records" : {"message": "test message 1"}}';

    expect(parseJSONArray(objectWithoutArray)).toEqual({
      entries: [],
      pathToEntries: [],
      errorNoArrayFound: true,
    });
  });

  it('should throw an error for invalid JSON object', () => {
    // The array is never closed, so the input is not valid JSON.
    const unterminatedArray = '[{"message":"test message 1"}';

    expect(() => parseJSONArray(unterminatedArray)).toThrow();
  });
});

describe('SampleLogsInput', () => {
let result: RenderResult;
let input: HTMLElement;
Expand All @@ -49,6 +162,7 @@ describe('SampleLogsInput', () => {
it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: logsSampleRaw.split(','),
samplesFormat: { name: 'json', json_path: [] },
});
});

Expand All @@ -61,6 +175,7 @@ describe('SampleLogsInput', () => {
it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: tooLargeLogsSample.split(',').slice(0, 10),
samplesFormat: { name: 'json', json_path: [] },
});
});
it('should add a notification toast', () => {
Expand All @@ -71,6 +186,19 @@ describe('SampleLogsInput', () => {
});
});

describe('when the file is a json array under a key', () => {
beforeEach(async () => {
await changeFile(input, new File([complexEventsJSON], 'test.json', { type }));
});

it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: splitNDJSON,
samplesFormat: { name: 'json', json_path: ['events'] },
});
});
});

describe('when the file is invalid', () => {
describe.each([
[
Expand All @@ -91,6 +219,7 @@ describe('SampleLogsInput', () => {
it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: undefined,
samplesFormat: undefined,
});
});
});
Expand All @@ -101,26 +230,27 @@ describe('SampleLogsInput', () => {
const type = 'application/x-ndjson';

describe('when the file is valid ndjson', () => {
const logsSampleRaw = `{"message":"test message 1"}\n{"message":"test message 2"}`;
beforeEach(async () => {
await changeFile(input, new File([logsSampleRaw], 'test.json', { type }));
await changeFile(input, new File([simpleNDJSON], 'test.json', { type }));
});

it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: logsSampleRaw.split('\n'),
logsSampleParsed: splitNDJSON,
samplesFormat: { name: 'ndjson', multiline: false },
});
});

describe('when the file has too many rows', () => {
const tooLargeLogsSample = Array(6).fill(logsSampleRaw).join('\n'); // 12 entries
const tooLargeLogsSample = Array(6).fill(simpleNDJSON).join('\n'); // 12 entries
beforeEach(async () => {
await changeFile(input, new File([tooLargeLogsSample], 'test.json', { type }));
});

it('should truncate the logs sample', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: tooLargeLogsSample.split('\n').slice(0, 10),
samplesFormat: { name: 'ndjson', multiline: false },
});
});
it('should add a notification toast', () => {
Expand All @@ -131,6 +261,32 @@ describe('SampleLogsInput', () => {
});
});

describe('when the file is a an ndjson with a single record', () => {
beforeEach(async () => {
await changeFile(input, new File([multilineNDJSON.split('\n')[0]], 'test.json', { type }));
});

it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: [splitNDJSON[0]],
samplesFormat: { name: 'ndjson', multiline: false },
});
});
});

describe('when the file is multiline ndjson', () => {
beforeEach(async () => {
await changeFile(input, new File([multilineNDJSON], 'test.json', { type }));
});

it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: splitNDJSON,
samplesFormat: { name: 'ndjson', multiline: true },
});
});
});

describe('when the file is invalid', () => {
describe.each([
[
Expand All @@ -151,6 +307,7 @@ describe('SampleLogsInput', () => {
it('should set the integrationSetting correctly', () => {
expect(mockActions.setIntegrationSettings).toBeCalledWith({
logsSampleParsed: undefined,
samplesFormat: undefined,
});
});
});
Expand Down
Loading

0 comments on commit 2a8b6d0

Please sign in to comment.