From 7d098e42dfd08ea1f2e63355e2a95c2b38e3b768 Mon Sep 17 00:00:00 2001 From: Nathan Fox Date: Fri, 30 Jun 2023 10:59:08 -0400 Subject: [PATCH] chore(docs): Add Log Namespacing docs (#16571) This updates documentation and adds a blog-post announcing the log namespacing feature (as a beta release). --------- Co-authored-by: Spencer Gilbert --- website/content/en/blog/log-namespacing.md | 169 ++++++++++++++++++ website/cue/reference/configuration.cue | 20 ++- .../remap/functions/set_semantic_meaning.cue | 44 +++++ website/cue/reference/urls.cue | 1 + website/layouts/partials/data.html | 33 ++++ 5 files changed, 263 insertions(+), 4 deletions(-) create mode 100644 website/content/en/blog/log-namespacing.md create mode 100644 website/cue/reference/remap/functions/set_semantic_meaning.cue diff --git a/website/content/en/blog/log-namespacing.md b/website/content/en/blog/log-namespacing.md new file mode 100644 index 0000000000000..4516b5685794b --- /dev/null +++ b/website/content/en/blog/log-namespacing.md @@ -0,0 +1,169 @@ +--- +title: Log Namespacing +short: Log Namespacing +description: Changing Vector's data model +authors: ["fuchsnj"] +date: "2023-06-30" +badges: + type: announcement + domains: ["data model"] +tags: [] +--- + +The Vector team has been hard at work improving the data model of events in Vector. These +changes are now available for beta testing for those who want to try it out and give feedback. +This is an opt-in feature. Nothing should change unless you specifically enable it. + +## Why + +Currently, all data for events is placed at the root of the event, regardless of where the data came +from or how it was obtained. Not only can that make it confusing to understand what a certain field +represents (eg: was the `timestamp` field generated by Vector when it was ingested, or is it when +the source originally created the event) but it can easily cause data collisions. 
+ +Log namespacing also unblocks powerful features being worked on, such as end-to-end type checking +of events in Vector. + +## How to enable + +The [global config] `schema.log_namespace` can be set to `true` to enable the new +Log Namespacing feature for all components. The default is `false`. + +Every source also has a `log_namespace` config option. This will override the global setting, +so you can try out Log Namespacing on individual sources. + +The following example enables the `log_namespace` feature globally, then disables it for a single +source. + +```toml +schema.log_namespace = true + +[sources.input_with_log_namespace] +type = "demo_logs" +format = "shuffle" +lines = ["input_with_log_namespace"] +interval = 1 + +[sources.input_without_log_namespace] +type = "demo_logs" +format = "shuffle" +lines = ["input_without_log_namespace"] +interval = 1 +log_namespace = false + +[sinks.console] +type = "console" +inputs = ["input_with_log_namespace", "input_without_log_namespace"] +encoding.codec = "json" + +``` + +## How It Works + +### Data Layout + +When handling log events, information is categorized into one of the following groups: +(Examples are from the `datadog_agent` source) + +- Event Data: The decoded event data. (eg: the log itself) +- Source Metadata: Metadata provided by the source of the event. (eg: hostname / tags) +- Vector Metadata: Metadata provided by Vector. (eg: the time when Vector received the event) + +#### Without Log Namespacing + +All three of these are placed at the root of the event. The exact layout depends on the source, +some fields are configurable, and the [global log schema] can change the name / location of some +fields. 
+ +Example log event from the `datadog_agent` source (with the JSON decoder) + +```json +{ + "ddsource": "vector", + "ddtags": "env:prod", + "hostname": "alpha", + "foo": "foo field", + "service": "cernan", + "source_type": "datadog_agent", + "bar": "bar field", + "status": "warning", + "timestamp": "1970-02-14T20:44:57.570Z" +} +``` + +#### With Log Namespacing + +When enabled, the layout of this data is well-defined and consistent. + +Event Data (and _only_ Event Data) is placed at the root of the event (eg: `.`). +Source metadata is placed in event metadata, prefixed by the source name. (eg: `%datadog_agent`) +Vector metadata is placed in event metadata, prefixed by `vector`. (eg: `%vector`) + +Generally sinks will only send the event data. If you want to include any metadata fields, +it's recommended to use a [remap] transform to add data to the event as needed. + +It's important to note that previously the type of an event (`.`) would always be an object +with fields. Now it is possible for an event to be any type, such as a string. + +Example log event from the `datadog_agent` source. (same data as the example above) + +Event root (`.`) + +```json +{ + "foo": "foo field", + "bar": "bar field" +} +``` + +Source metadata fields (`%datadog_agent`) + +```json +{ + "ddsource": "vector", + "ddtags": "env:prod", + "hostname": "alpha", + "service": "cernan", + "status": "warning", + "timestamp": "1970-02-14T20:44:57.570Z" +} +``` + +Vector metadata fields (`%vector`) + +```json +{ + "source_type": "datadog_agent", + "ingest_timestamp": "1970-02-14T20:44:58.236Z" +} +``` + +Here is a sample VRL script accessing different parts of an event when log namespacing is enabled. + +```coffee +event = . +field_from_event = .foo + +all_metadata = % +tags = %datadog_agent.ddtags +timestamp = %vector.ingest_timestamp + +``` + +### Semantic Meaning + +Before Log Namespacing, Vector used the [global log schema] to keep certain types of information +at known locations. 
This is changing, and when log namespacing is enabled, the [global log schema] +will no longer be used. To replace it, a new feature called "semantic meaning" will be used instead. +This allows assigning meaning to different fields of an event, which allows sinks to access +information needed, such as timestamps, hostname, the message, etc. + +Semantic meaning will automatically be assigned by all sources. Sinks will check on startup to make +sure a meaning exists for all required fields. If a source does not provide a required field, or +a meaning needs to be manually adjusted for any reason, the VRL function [set_semantic_meaning] can +be used. + +[global log schema]: /docs/reference/configuration/global-options/#log_schema +[set_semantic_meaning]: /docs/reference/vrl/functions/#set_semantic_meaning +[remap]: /docs/reference/configuration/transforms/remap/ +[global config]: /docs/reference/configuration/global-options/#log_namespacing diff --git a/website/cue/reference/configuration.cue b/website/cue/reference/configuration.cue index d1a1476d3de38..08d9b0bb6b88d 100644 --- a/website/cue/reference/configuration.cue +++ b/website/cue/reference/configuration.cue @@ -251,6 +251,17 @@ configuration: { } } } + log_namespacing: { + common: false + description: """ + Globally enables / disables log namespacing. See [Log Namespacing](\(urls.log_namespacing_blog)) + for more details. If you want to enable individual sources, there is a config + option in the source configuration. + """ + required: false + warnings: [] + type: bool: default: false + } telemetry: { common: false @@ -274,7 +285,7 @@ configuration: { common: true description: """ Add a `source` tag with the source component the event was received from. - + If there is no source component, for example if the event was generated by the `lua` transform a `-` is emitted for this tag. 
""" @@ -309,13 +320,14 @@ configuration: { } log_schema: { - common: false + common: false description: """ Configures default log schema for all events. This is used by - Vector source components to assign the fields on incoming + Vector components to assign the fields on incoming events. + These values are ignored if log namespacing is enabled. (See [Log Namespacing](\(urls.log_namespacing_blog))) """ - required: false + required: false type: object: { examples: [] options: { diff --git a/website/cue/reference/remap/functions/set_semantic_meaning.cue b/website/cue/reference/remap/functions/set_semantic_meaning.cue new file mode 100644 index 0000000000000..d21ca5b121b27 --- /dev/null +++ b/website/cue/reference/remap/functions/set_semantic_meaning.cue @@ -0,0 +1,44 @@ +package metadata + +remap: functions: set_semantic_meaning: { + category: "Event" + description: """ + Sets a semantic meaning for an event. Note that this function assigns + meaning at Vector startup, and has _no_ runtime behavior. It is suggested + to put all calls to this function at the beginning of a VRL function. The function + cannot be conditionally called (eg: using an if statement cannot stop the meaning + from being assigned). + """ + + arguments: [ + { + name: "target" + description: """ + The path of the value that will be assigned a meaning. + """ + required: true + type: ["path"] + }, + { + name: "meaning" + description: """ + The name of the meaning to assign. 
+ """ + required: true + type: ["string"] + }, + ] + internal_failure_reasons: [ + ] + return: types: ["null"] + + examples: [ + { + title: "Sets custom field semantic meaning" + source: #""" + set_semantic_meaning(.foo, "bar") + """# + return: null + }, + ] +} diff --git a/website/cue/reference/urls.cue b/website/cue/reference/urls.cue index 0e3560392c108..2f6921ba39eea 100644 --- a/website/cue/reference/urls.cue +++ b/website/cue/reference/urls.cue @@ -313,6 +313,7 @@ urls: { logfmt_specs: "https://pkg.go.dev/github.com/kr/logfmt#section-documentation" logstash: "https://www.elastic.co/logstash" logstash_protocol: "https://github.com/elastic/logstash-forwarder/blob/master/PROTOCOL.md" + log_namespacing_blog: "/blog/log-namespacing/" loki: "https://grafana.com/oss/loki/" loki_multi_tenancy: "\(github)/grafana/loki/blob/master/docs/operations/multi-tenancy.md" log_event_source: "\(vector_repo)/blob/master/src/event/" diff --git a/website/layouts/partials/data.html b/website/layouts/partials/data.html index 9a7b71d4fd8a1..3feefe90cd353 100644 --- a/website/layouts/partials/data.html +++ b/website/layouts/partials/data.html @@ -257,6 +257,39 @@ +
+ +

+ Warning + +

+
+
+
+ + + +
+
The fields shown below will be + different if log namespacing is enabled. + See Log Namespacing for + more details +
+
+
+
{{ template "logs_output" . }}