From d8c1b469d1e263ac904539f69e586c110a4b47c1 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 19 Dec 2024 16:42:10 -0500 Subject: [PATCH] Revert the file_identity back to native --- CHANGELOG.next.asciidoc | 6 ++--- .../config/filebeat.inputs.reference.yml.tmpl | 7 +++--- .../docs/inputs/input-filestream.asciidoc | 25 +++++++++++++------ filebeat/filebeat.reference.yml | 7 +++--- filebeat/input/filestream/fswatch.go | 2 +- filebeat/input/filestream/identifier.go | 2 +- x-pack/filebeat/filebeat.reference.yml | 7 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index ff1c7fea449..b59e414d4b4 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -49,11 +49,9 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Remove deprecated awscloudwatch field from Filebeat. {pull}41089[41089] - The performance of ingesting SQS data with the S3 input has improved by up to 60x for queues with many small events. `max_number_of_messages` config for SQS mode is now ignored, as the new design no longer needs a manual cap on messages. Instead, use `number_of_workers` to scale ingestion rate in both S3 and SQS modes. The increased efficiency may increase network bandwidth consumption, which can be throttled by lowering `number_of_workers`. It may also increase number of events stored in memory, which can be throttled by lowering the configured size of the internal queue. {pull}40699[40699] - Fixes filestream logging the error "filestream input with ID 'ID' already exists, this will lead to data duplication[...]" on Kubernetes when using autodiscover. {pull}41585[41585] - - Add kafka compression support for ZSTD. - - Filebeat fails to start if there is any input with a duplicated ID. It logs the duplicated IDs and the offending inputs configurations. {pull}41731[41731] -- The Filestream input only starts to ingest a file when it is >= 1024 bytes in size. This happens because the fingerprint` is the default file identity now. To restore the previous behaviour, set `file_identity.native: ~` and `prospector.scanner.fingerprint.enabled: false` {issue}40197[40197] {pull}41762[41762] + *Heartbeat* @@ -359,7 +357,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Add support for SSL and Proxy configurations for websoket type in streaming input. {pull}41934[41934] - AWS S3 input registry cleanup for untracked s3 objects. {pull}41694[41694] - The environment variable `BEATS_AZURE_EVENTHUB_INPUT_TRACING_ENABLED: true` enables internal logs tracer for the azure-eventhub input. {issue}41931[41931] {pull}41932[41932] -- The Filestream input now uses the `fingerprint` file identity by default. The state from files are automatically migrated if the previous file identity was `native` (the default) or `path`. If the `file_identity` is explicitly set, there is no change in behaviour. {issue}40197[40197] {pull}41762[41762] +- The Filestream input can automatically migrate state from files when changing the `file_identity` if the previous file identity was `native` (the default) or `path`. {issue}40197[40197] {pull}41762[41762] - Rate limiting operability improvements in the Okta provider of the Entity Analytics input. {issue}40106[40106] {pull}41977[41977] - Added default values in the streaming input for websocket retries and put a cap on retry wait time to be lesser than equal to the maximum defined wait time. {pull}42012[42012] diff --git a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl index 5e44bcdb09e..ba658819582 100644 --- a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl +++ b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl @@ -303,7 +303,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: true + #prospector.scanner.fingerprint.enabled: false # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -438,9 +438,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # a fingerprint is generated using the first 1024 bytes of the file, - # if the fingerprints match, then the files are considered equal. - #file_identity.fingerprint: ~ + # the Beat considers two files the same if their inode and device id are the same. + #file_identity.native: ~ # Optional additional fields. These fields can be freely picked # to add additional information to the crawled log files for filtering diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index 74b7514b91a..96ba5e273e5 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -34,10 +34,11 @@ The `log` writes the complete file state. 7. Stale entries can be removed from the registry, even if there is no active input. -8. The default behaviour is to identify files based on their contents -using the <> <> This solves data duplication caused by inode reuse. +8. The input can identify files based on their contents when using the +<> +<> instead +of the default inode and device ID. This solves data duplication +caused by inode reuse. To configure this input, specify a list of glob-based <> that must be crawled to locate and fetch the log lines. @@ -93,7 +94,15 @@ multiple input sections: WARNING: Some file identity methods do not support reading from network shares and cloud providers, to avoid duplicating events, use -the default `file_identity`: `fingerprint`. +`fingerprint` when reading from network shares or cloud providers. + +By default, {beatname_uc} identifies files based on their inodes and +device IDs. However, on network shares and cloud providers these +values might change during the lifetime of the file. If this happens +{beatname_uc} thinks that file is new and resends the whole content +of the file. To solve this problem you can configure the `file_identity` option. Possible +values besides the default `native` (inode + device ID) are +`fingerprint`, `path` and `inode_marker`. IMPORTANT: Changing `file_identity` is only supported when migrating from `native` or `path` to `fingerprint`. @@ -101,7 +110,7 @@ migrating from `native` or `path` to `fingerprint`. WARNING: Any unsupported change in `file_identity` methods between runs may result in duplicated events in the output. -`fingerprint` is the default and recommended file identity because it does not +`fingerprint` is the recommended file identity because it does not rely on the file system/OS, it generates a hash from a portion of the file (the first 1024 bytes, by default) and uses that to identify the file. This works well with log rotation strategies that move/rename @@ -109,7 +118,9 @@ the file and on Windows as file identifiers might be more volatile. The downside is that {beatname_uc} will wait until the file reaches 1024 bytes before start ingesting any file. -WARNING: Once this file identity is enabled, changing +WARNING: In order to use this file identity option, one must enable +the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint +option in the scanner>>. Once this file identity is enabled, changing the fingerprint configuration (offset, length, etc) will lead to a global re-ingestion of all files that match the paths configuration of the input. diff --git a/filebeat/filebeat.reference.yml b/filebeat/filebeat.reference.yml index 0b35505cd33..be189fdfd1c 100644 --- a/filebeat/filebeat.reference.yml +++ b/filebeat/filebeat.reference.yml @@ -716,7 +716,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: true + #prospector.scanner.fingerprint.enabled: false # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -851,9 +851,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # a fingerprint is generated using the first 1024 bytes of the file, - # if the fingerprints match, then the files are considered equal. - #file_identity.fingerprint: ~ + # the Beat considers two files the same if their inode and device id are the same. + #file_identity.native: ~ # Optional additional fields. These fields can be freely picked # to add additional information to the crawled log files for filtering diff --git a/filebeat/input/filestream/fswatch.go b/filebeat/input/filestream/fswatch.go index 00d84ed9ab4..c51d850bbd2 100644 --- a/filebeat/input/filestream/fswatch.go +++ b/filebeat/input/filestream/fswatch.go @@ -278,7 +278,7 @@ func defaultFileScannerConfig() fileScannerConfig { Symlinks: false, RecursiveGlob: true, Fingerprint: fingerprintConfig{ - Enabled: true, + Enabled: false, Offset: 0, Length: DefaultFingerprintSize, }, diff --git a/filebeat/input/filestream/identifier.go b/filebeat/input/filestream/identifier.go index 08bb0c5f071..a0cd7903e7a 100644 --- a/filebeat/input/filestream/identifier.go +++ b/filebeat/input/filestream/identifier.go @@ -76,7 +76,7 @@ func (f fileSource) Name() string { // newFileIdentifier creates a new state identifier for a log input. func newFileIdentifier(ns *conf.Namespace, suffix string) (fileIdentifier, error) { if ns == nil { - i, err := newFingerprintIdentifier(nil) + i, err := newINodeDeviceIdentifier(nil) if err != nil { return nil, err } diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index 1b560be40f1..0c2eb0c0c51 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -2400,7 +2400,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: true + #prospector.scanner.fingerprint.enabled: false # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -2535,9 +2535,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # a fingerprint is generated using the first 1024 bytes of the file, - # if the fingerprints match, then the files are considered equal. - #file_identity.fingerprint: ~ + # the Beat considers two files the same if their inode and device id are the same. + #file_identity.native: ~ # Optional additional fields. These fields can be freely picked # to add additional information to the crawled log files for filtering