diff --git a/tensorflow_io/pcap/BUILD b/tensorflow_io/pcap/BUILD new file mode 100644 index 000000000..7bc561347 --- /dev/null +++ b/tensorflow_io/pcap/BUILD @@ -0,0 +1,22 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +cc_binary( + name = "python/ops/_pcap_ops.so", + srcs = [ + "kernels/pcap_input.cc", + "ops/pcap_ops.cc", + ], + copts = [ + "-pthread", + "-std=c++11", + "-DNDEBUG", + ], + linkshared = 1, + deps = [ + "//tensorflow_io/core:dataset_ops", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) diff --git a/tensorflow_io/pcap/__init__.py b/tensorflow_io/pcap/__init__.py new file mode 100644 index 000000000..d8d0aeafd --- /dev/null +++ b/tensorflow_io/pcap/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""PcapInput + +@@PcapDataset +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.util.all_util import remove_undocumented +from tensorflow_io.pcap.python.ops.pcap_ops import PcapDataset + +_allowed_symbols = [ + "PcapDataset", +] + +remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow_io/pcap/kernels/pcap_input.cc b/tensorflow_io/pcap/kernels/pcap_input.cc new file mode 100644 index 000000000..ba877d416 --- /dev/null +++ b/tensorflow_io/pcap/kernels/pcap_input.cc @@ -0,0 +1,207 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernels/dataset_ops.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" + +namespace tensorflow { +namespace data { + +class PcapInputStream : public io::BufferedInputStream { +public: + + const uint32_t MAGIC_NUMBER = 0xA1B2C3D4; // magic number for big endian machine with microsecond resolution + const uint32_t MAGIC_NUMBER_REVERSED = 0xD4C3B2A1; // magic number for little endian machine with microsecond resolution + + const uint16_t PCAP_ERRBUF_SIZE = 256; + const uint16_t PCAP_TSTAMP_PRECISION_MICRO = 0; // use timestamps with microsecond precision by default + + explicit PcapInputStream(InputStreamInterface* input_stream) + : io::BufferedInputStream(input_stream, 256 * 1024) { + } + + Status ReadRecord(double& timestamp, string* packet_data, int64& record_read) { + string buffer; + buffer.clear(); + + // read packet header + TF_RETURN_IF_ERROR(ReadNBytes(sizeof(struct PacketHeader), &buffer)); + struct PacketHeader *header = (struct PacketHeader *)buffer.data(); + + if (reverse_header_byte_order) { + // switch byte order to get accurate representation of field values + EndianSwap(header->ts_sec); + EndianSwap(header->ts_msec); + EndianSwap(header->caplen); + EndianSwap(header->orig_len); + } + + // Combine date and time in seconds plus milliseconds offset into one composite value + timestamp = header->ts_sec + (header->ts_msec / 1e6); + + // read packet data + TF_RETURN_IF_ERROR(ReadNBytes(header->caplen, packet_data)); + + record_read = 1; // this method reads one packet at a time from the input buffer + + return Status::OK(); + } + + Status ReadHeader() { + string buffer; + // read file header + TF_RETURN_IF_ERROR(ReadNBytes(sizeof(struct PcapFileHeader), &buffer)); + struct PcapFileHeader *header = (struct PcapFileHeader *)buffer.data(); + + if(!ValidateMagicNumber(header->magic_number)) { + return errors::InvalidArgument("PCAP file must starts with a standard magic number."); + } + + if (reverse_header_byte_order) { + // switch byte order to get accurate representation of packet slices + // snaplen will be needed to reconstruct sliced payloads spread across multiple pcap packets + EndianSwap(header->snaplen); + } + return Status::OK(); + } + +private: + + /** + PcapFileHeader format: (https://wiki.wireshark.org/Development/LibpcapFileFormat) + magic_number: used to detect the file format itself and the byte ordering. The writing application writes 0xa1b2c3d4 with it's native byte ordering format into this field. The reading application will read either 0xa1b2c3d4 (identical) or 0xd4c3b2a1 (swapped). If the reading application reads the swapped 0xd4c3b2a1 value, it knows that all the following fields will have to be swapped too. For nanosecond-resolution files, the writing application writes 0xa1b23c4d, with the two nibbles of the two lower-order bytes swapped, and the reading application will read either 0xa1b23c4d (identical) or 0x4d3cb2a1 (swapped). + version_major, version_minor: the version number of this file format (current version is 2.4) + thiszone: the correction time in seconds between GMT (UTC) and the local timezone of the following packet header timestamps. Examples: If the timestamps are in GMT (UTC), thiszone is simply 0. If the timestamps are in Central European time (Amsterdam, Berlin, ...) which is GMT + 1:00, thiszone must be -3600. In practice, time stamps are always in GMT, so thiszone is always 0. + sigfigs: in theory, the accuracy of time stamps in the capture; in practice, all tools set it to 0 + snaplen: the "snapshot length" for the capture (typically 65535 or even more, but might be limited by the user), see: incl_len vs. orig_len below + linktype: link-layer header type, specifying the type of headers at the beginning of the packet (e.g. 1 for Ethernet, see tcpdump.org's link-layer header types page for details); this can be various types such as 802.11, 802.11 with various radio information, PPP, Token Ring, FDDI, etc. + */ + struct PcapFileHeader { + uint32_t magic_number; + uint16_t version_major; + uint16_t version_minor; + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + uint32_t linktype; + }; + + + /** + PacketHeader format: + ts_sec: the date and time when this packet was captured. This value is in seconds since January 1, 1970 00:00:00 GMT; this is also known as a UN*X time_t. You can use the ANSI C time() function from time.h to get this value, but you might use a more optimized way to get this timestamp value. If this timestamp isn't based on GMT (UTC), use thiszone from the global header for adjustments. + ts_msec: in regular pcap files, the microseconds when this packet was captured, as an offset to ts_sec. In nanosecond-resolution files, this is, instead, the nanoseconds when the packet was captured, as an offset to ts_sec /!\ Beware: this value shouldn't reach 1 second (in regular pcap files 1 000 000; in nanosecond-resolution files, 1 000 000 000); in this case ts_sec must be increased instead! + caplen: the number of bytes of packet data actually captured and saved in the file. This value should never become larger than orig_len or the snaplen value of the global header. + orig_len: the length of the packet as it appeared on the network when it was captured. If caplen and orig_len differ, the actually saved packet size was limited by snaplen. + */ + struct PacketHeader + { + uint32_t ts_sec; + uint32_t ts_msec; + uint32_t caplen; + uint32_t orig_len; + }; + + inline void EndianSwap(uint16_t& x) + { + x = (x>>8) | + (x<<8); + } + + inline void EndianSwap(uint32_t& x) + { + x = (x>>24) | + ((x<<8) & 0x00FF0000) | + ((x>>8) & 0x0000FF00) | + (x<<24); + } + + bool reverse_header_byte_order = false; // is the pcap file using little endian or big endian byte order + + /** + Check for the magic numbers of the two most typical pcap formats used in practice. + */ + bool ValidateMagicNumber(uint32_t magic_number) { + if (magic_number == MAGIC_NUMBER) { + return true; + } else if (magic_number == MAGIC_NUMBER_REVERSED) { + reverse_header_byte_order = true; + return true; + } else { + return false; + } + } + +}; // end of class PcapInputStream + + +class PcapInput: public FileInput { + public: + Status ReadRecord(io::InputStreamInterface* s, IteratorContext* ctx, std::unique_ptr& state, int64 record_to_read, int64* record_read, std::vector* out_tensors) const override { + if (state.get() == nullptr) { + state.reset(new PcapInputStream(s)); + TF_RETURN_IF_ERROR(state.get()->ReadHeader()); + } + // Let's allocate enough space for Tensor, if more than read, replace. + // The output tensor has two columns (packet_timestamp,packet_data). + // Hence the shape of the output tensor is (record_to_read,2) unless there are less than record_to_read packets left in the file + Tensor tensor_packet_ts(ctx->allocator({}), DT_DOUBLE, {record_to_read}); // Tensor column for packet timestamps + out_tensors->emplace_back(std::move(tensor_packet_ts)); // add timestamp column to the output tensor + Tensor tensor_packet_data(ctx->allocator({}), DT_STRING, {record_to_read}); // Tensor column for packet data + out_tensors->emplace_back(std::move(tensor_packet_data)); // add data column to the output tensor + + // read packets from the file up to record_to_read or end of file + while ((*record_read) < record_to_read) { + int64 record_count = 0; + double packet_timestamp; + string packet_data_buffer; + Status status = state.get()->ReadRecord(packet_timestamp, &packet_data_buffer, record_count); + if (!(status.ok() || errors::IsOutOfRange(status))) { + return status; + } + if (record_count > 0) { + Tensor timestamp_tensor = (*out_tensors)[0]; + timestamp_tensor.flat()(*record_read) = packet_timestamp; + Tensor data_tensor = (*out_tensors)[1]; + data_tensor.flat()(*record_read) = std::move(packet_data_buffer); + (*record_read) += record_count; + } else { + // no more records available to read + // record_count == 0 + break; + } + } + return Status::OK(); + } + Status FromStream(io::InputStreamInterface* s) override { + return Status::OK(); + } + void EncodeAttributes(VariantTensorData* data) const override { + } + bool DecodeAttributes(const VariantTensorData& data) override { + return true; + } + protected: +}; + +REGISTER_UNARY_VARIANT_DECODE_FUNCTION(PcapInput, "tensorflow::data::PcapInput"); + +REGISTER_KERNEL_BUILDER(Name("PcapInput").Device(DEVICE_CPU), + FileInputOp); +REGISTER_KERNEL_BUILDER(Name("PcapDataset").Device(DEVICE_CPU), + FileInputDatasetOp); + +} // namespace data +} // namespace tensorflow diff --git a/tensorflow_io/pcap/ops/pcap_ops.cc b/tensorflow_io/pcap/ops/pcap_ops.cc new file mode 100644 index 000000000..4fb9903f0 --- /dev/null +++ b/tensorflow_io/pcap/ops/pcap_ops.cc @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("PcapInput") + .Input("source: string") + .Output("handle: variant") + .Attr("filters: list(string) = []") + .Attr("columns: list(string) = []") + .Attr("schema: string = ''") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->MakeShape({c->UnknownDim()})); + return Status::OK(); + }); + +REGISTER_OP("PcapDataset") + .Input("input: T") + .Input("batch: int64") + .Output("handle: variant") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .Attr("T: {string, variant} = DT_VARIANT") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->MakeShape({})); + return Status::OK(); + }); + +} // namespace tensorflow diff --git a/tensorflow_io/pcap/python/__init__.py b/tensorflow_io/pcap/python/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tensorflow_io/pcap/python/ops/__init__.py b/tensorflow_io/pcap/python/ops/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tensorflow_io/pcap/python/ops/pcap_ops.py b/tensorflow_io/pcap/python/ops/pcap_ops.py new file mode 100644 index 000000000..5fa527d5b --- /dev/null +++ b/tensorflow_io/pcap/python/ops/pcap_ops.py @@ -0,0 +1,40 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""PcapDataset""" +import tensorflow as tf +from tensorflow_io.core.python.ops import data_ops as data_ops +from tensorflow_io import _load_library +pcap_ops = _load_library('_pcap_ops.so') + + +class PcapDataset(data_ops.Dataset): + """A pcap Dataset. Pcap is a popular file format for capturing network packets. + """ + + def __init__(self, filenames, batch=None): + """Create a pcap Reader. + + Args: + filenames: A `tf.string` tensor containing one or more filenames. + """ + batch = 0 if batch is None else batch + dtypes = [tf.float64, tf.string] + shapes = [ + tf.TensorShape([]), tf.TensorShape([])] if batch == 0 else [ + tf.TensorShape([None]), tf.TensorShape([None])] + super(PcapDataset, self).__init__( + pcap_ops.pcap_dataset, + pcap_ops.pcap_input(filenames), + batch, dtypes, shapes) diff --git a/tests/test_pcap/http.pcap b/tests/test_pcap/http.pcap new file mode 100644 index 000000000..54f6f2953 Binary files /dev/null and b/tests/test_pcap/http.pcap differ diff --git a/tests/test_pcap_eager.py b/tests/test_pcap_eager.py new file mode 100644 index 000000000..7904e75d9 --- /dev/null +++ b/tests/test_pcap_eager.py @@ -0,0 +1,50 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +""" +Test PcapDataset +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf +import tensorflow_io.pcap as pcap_io # pylint: disable=wrong-import-position + +if not (hasattr(tf, "version") and tf.version.VERSION.startswith("2.")): + tf.compat.v1.enable_eager_execution() + + +def test_pcap_input(): + """test_pcap_input + """ + print("Testing PcapDataset") + pcap_filename = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_pcap", "http.pcap") + file_url = "file://" + pcap_filename + url_filenames = [file_url] + dataset = pcap_io.PcapDataset(url_filenames, batch=1) + + packets_total = 0 + for v in dataset: + (packet_timestamp, packet_data) = v + if packets_total == 0: + assert packet_timestamp.numpy()[0] == 1084443427.311224 # we know this is the correct value in the test pcap file + assert len(packet_data.numpy()[0]) == 62 # we know this is the correct packet data buffer length in the test pcap file + packets_total += 1 + assert packets_total == 43 # we know this is the correct number of packets in the test pcap file + +if __name__ == "__main__": + test.main()