From a1e41eef80e8d64cefbe9dba1e1619abc660dc67 Mon Sep 17 00:00:00 2001 From: Matisse Callewaert Date: Wed, 14 Feb 2024 19:59:15 +0100 Subject: [PATCH 1/4] :sparkles: Add Cli parsing --- Cargo.toml | 1 + src/args.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 11 +++++++++-- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 src/args.rs diff --git a/Cargo.toml b/Cargo.toml index fcb6352..f1f912e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +clap = { version = "4.5.0", features = ["derive"] } diff --git a/src/args.rs b/src/args.rs new file mode 100644 index 0000000..ccfb790 --- /dev/null +++ b/src/args.rs @@ -0,0 +1,53 @@ +use clap:: { + Parser, + Subcommand, +}; + +#[derive(Debug, Parser)] +#[clap(author, version, about)] +pub struct Cli { + #[clap(subcommand)] + command: Commands, +} + +#[derive(Debug, Subcommand)] +pub enum Commands { + /// Real-time feature extraction + Realtime, + + /// Feature extraction from a dataset + Dataset { + #[clap(value_enum)] + dataset: Dataset, + + /// The relative path to the dataset + path: String, + }, +} + +#[derive(clap::ValueEnum, Clone, Debug)] +pub enum Dataset { + /// CIC-IDS2017 from the Canadian Institute for Cybersecurity + CicIds2017, + + /// CSE-CIC-IDS2018 from the Canadian Institute for Cybersecurity + CseCicIds2018, + + /// CIC-DDoS2019 from the Canadian Institute for Cybersecurity + CicDdos2019, + + /// CIC-IDS-Collection from Laurens D'Hooge + CicIdsCollection, + + /// CTU-13 from CTU university of the Czech Republic + Ctu13, + + /// CTU-13 without contaminant features from Laurens D'Hooge + Ctu13Ld, + + /// UNSW-NB15 from UNSW Sydney + UnswNb15, + + /// UNSW-NB15 without contaminant features from Laurens D'Hooge + UnswNb15Ld, +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index e7a11a9..39c5f25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,10 @@ +mod args; + +use args::Cli; +use clap::Parser; + fn main() { - println!("Hello, world!"); -} + let args = Cli::parse(); + + print!("{:?}\n", args); +} \ No newline at end of file From fb593328effef58f08939dc99daf47460c076537 Mon Sep 17 00:00:00 2001 From: Matisse Callewaert Date: Fri, 16 Feb 2024 10:56:51 +0100 Subject: [PATCH 2/4] :art: Add rust formatter --- rustfmt.toml | 1 + 1 file changed, 1 insertion(+) create mode 100644 rustfmt.toml diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..d100efd --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +max_width = 100 \ No newline at end of file From 5bca9eefb715a198d47d842f26c62c8696386731 Mon Sep 17 00:00:00 2001 From: Matisse Callewaert Date: Fri, 16 Feb 2024 10:58:18 +0100 Subject: [PATCH 3/4] :heavy_plus_sign: Add csv and serde dependencies --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index f1f912e..4c14ac8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,5 @@ edition = "2021" [dependencies] clap = { version = "4.5.0", features = ["derive"] } +csv = "1.3.0" +serde = { version = "1.0.196", features = ["derive"] } From 79ea7b76b2019e76072336ba5999b1aac4f4a1a2 Mon Sep 17 00:00:00 2001 From: Matisse Callewaert Date: Fri, 16 Feb 2024 10:59:37 +0100 Subject: [PATCH 4/4] :sparkles: Add csv parser --- src/args.rs | 11 +- src/main.rs | 73 ++++++++++- src/parsers/csv_parser.rs | 63 +++++++++ src/parsers/mod.rs | 2 + src/parsers/parser.rs | 30 +++++ src/records/cic_record.rs | 269 ++++++++++++++++++++++++++++++++++++++ src/records/mod.rs | 2 + src/records/print.rs | 3 + 8 files changed, 442 insertions(+), 11 deletions(-) create mode 100644 src/parsers/csv_parser.rs create mode 100644 src/parsers/mod.rs create mode 100644 src/parsers/parser.rs create mode 100644 src/records/cic_record.rs create mode 100644 src/records/mod.rs create mode 100644 src/records/print.rs diff --git a/src/args.rs b/src/args.rs index ccfb790..be34910 100644 --- a/src/args.rs +++ b/src/args.rs @@ -1,13 +1,10 @@ -use clap:: { - Parser, - Subcommand, -}; +use clap::{Parser, Subcommand}; #[derive(Debug, Parser)] #[clap(author, version, about)] pub struct Cli { #[clap(subcommand)] - command: Commands, + pub command: Commands, } #[derive(Debug, Subcommand)] @@ -47,7 +44,7 @@ pub enum Dataset { /// UNSW-NB15 from UNSW Sydney UnswNb15, - + /// UNSW-NB15 without contaminant features from Laurens D'Hooge UnswNb15Ld, -} \ No newline at end of file +} diff --git a/src/main.rs b/src/main.rs index 39c5f25..8f8d540 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,75 @@ mod args; +mod parsers; +mod records; -use args::Cli; +use core::panic; + +use args::{Cli, Commands, Dataset}; use clap::Parser; +use crate::{ + parsers::csv_parser::CsvParser, + records::{cic_record::CicRecord, print::Print}, +}; + fn main() { - let args = Cli::parse(); + let cli = Cli::parse(); + + match cli.command { + Commands::Realtime => { + handle_realtime(); + } + Commands::Dataset { dataset, path } => { + handle_dataset(dataset, &path); + } + } +} + +fn handle_realtime() { + println!("Real-time feature extraction"); +} + +fn handle_dataset(dataset: Dataset, path: &str) { + println!( + "Dataset feature extraction for {:?} from path: {}", + dataset, path + ); + + match dataset { + Dataset::CicIds2017 => { + if path.ends_with(".csv") { + let parser = CsvParser; + + match parser.parse::(path) { + Ok(records) => { + for record in records { + match record { + Ok(record) => { + record.print(); + } + Err(err) => { + // TODO: Will we output to stderr, drop the record or use default values? + eprintln!("Error: {:?}", err); + } + } + } + } + Err(err) => { + eprintln!("Error: {:?}", err); + } + } + } else if path.ends_with(".pcap") { + panic!("This file format is not supported yet..."); - print!("{:?}\n", args); -} \ No newline at end of file + } else if path.ends_with(".parquet") { + panic!("This file format is not supported yet..."); + + } else { + panic!("This file format is not supported..."); + } + } + _ => { + panic!("This is not implemented yet..."); + } + } +} diff --git a/src/parsers/csv_parser.rs b/src/parsers/csv_parser.rs new file mode 100644 index 0000000..1c889e5 --- /dev/null +++ b/src/parsers/csv_parser.rs @@ -0,0 +1,63 @@ +use super::parser::ReadError; +use csv::{ReaderBuilder, StringRecord, Trim}; +use serde::de::DeserializeOwned; +use std::collections::HashSet; +use std::fs::File; + +pub struct CsvParser; + +fn preprocess_headers(headers: &StringRecord) -> (StringRecord, HashSet) { + let mut unique_headers = StringRecord::new(); + let mut seen = HashSet::new(); + let mut indices = HashSet::new(); + + for (index, header) in headers.iter().enumerate() { + let trimmed_header = header.trim(); + if !seen.contains(trimmed_header) { + unique_headers.push_field(trimmed_header); + seen.insert(trimmed_header); + indices.insert(index); + } + } + + (unique_headers, indices) +} + +fn filter_record(record: &StringRecord, indices: &HashSet) -> StringRecord { + let mut filtered = StringRecord::new(); + for (index, field) in record.iter().enumerate() { + if indices.contains(&index) { + filtered.push_field(field); + } + } + filtered +} + +impl CsvParser { + pub fn parse( + &self, + file_path: &str, + ) -> Result>>, ReadError> + where + T: DeserializeOwned + 'static, + { + let file = File::open(file_path).map_err(ReadError::Io)?; + let mut rdr = ReaderBuilder::new().trim(Trim::All).from_reader(file); + + let headers = rdr.headers()?.clone(); + let (unique_headers, indices) = preprocess_headers(&headers); + + rdr.set_headers(unique_headers.clone()); + + let iter = rdr.into_records().map(move |result| { + result.map_err(ReadError::Csv).and_then(|record| { + let filtered_record = filter_record(&record, &indices); + + csv::StringRecord::deserialize(&filtered_record, Some(&unique_headers)) + .map_err(ReadError::Csv) + }) + }); + + Ok(Box::new(iter)) + } +} diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs new file mode 100644 index 0000000..5a33d09 --- /dev/null +++ b/src/parsers/mod.rs @@ -0,0 +1,2 @@ +pub mod csv_parser; +pub mod parser; diff --git a/src/parsers/parser.rs b/src/parsers/parser.rs new file mode 100644 index 0000000..a183cf3 --- /dev/null +++ b/src/parsers/parser.rs @@ -0,0 +1,30 @@ +use csv::Error as CsvError; +use serde::de::DeserializeOwned; +use std::io; + +#[derive(Debug)] +pub enum ReadError { + Io(io::Error), + Csv(CsvError), +} + +impl From for ReadError { + fn from(err: io::Error) -> Self { + ReadError::Io(err) + } +} + +impl From for ReadError { + fn from(err: CsvError) -> Self { + ReadError::Csv(err) + } +} + +pub trait Parser { + fn parse( + &self, + file_path: &str, + ) -> Result>>, ReadError> + where + T: DeserializeOwned + 'static; +} diff --git a/src/records/cic_record.rs b/src/records/cic_record.rs new file mode 100644 index 0000000..708f8f6 --- /dev/null +++ b/src/records/cic_record.rs @@ -0,0 +1,269 @@ +use super::print::Print; +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +pub struct CicRecord { + #[serde(rename = "Flow ID")] + pub flow_id: String, + #[serde(rename = "Source IP")] + pub src_ip: String, + #[serde(rename = "Source Port")] + pub src_port: u16, + #[serde(rename = "Destination IP")] + pub dst_ip: String, + #[serde(rename = "Destination Port")] + pub dst_port: u16, + #[serde(rename = "Protocol")] + pub protocol: u8, + #[serde(rename = "Timestamp")] + pub timestamp: String, + #[serde(rename = "Flow Duration")] + pub flow_duration: f64, + #[serde(rename = "Total Fwd Packets")] + pub tot_fwd_pkts: u32, + #[serde(rename = "Total Backward Packets")] + pub tot_bwd_pkts: u32, + #[serde(rename = "Total Length of Fwd Packets")] + pub totlen_fwd_pkts: f64, + #[serde(rename = "Total Length of Bwd Packets")] + pub totlen_bwd_pkts: f64, + #[serde(rename = "Fwd Packet Length Max")] + pub fwd_pkt_len_max: f64, + #[serde(rename = "Fwd Packet Length Min")] + pub fwd_pkt_len_min: f64, + #[serde(rename = "Fwd Packet Length Mean")] + pub fwd_pkt_len_mean: f64, + #[serde(rename = "Fwd Packet Length Std")] + pub fwd_pkt_len_std: f64, + #[serde(rename = "Bwd Packet Length Max")] + pub bwd_pkt_len_max: f64, + #[serde(rename = "Bwd Packet Length Min")] + pub bwd_pkt_len_min: f64, + #[serde(rename = "Bwd Packet Length Mean")] + pub bwd_pkt_len_mean: f64, + #[serde(rename = "Bwd Packet Length Std")] + pub bwd_pkt_len_std: f64, + #[serde(rename = "Flow Bytes/s")] + pub flow_bytes_s: f64, + #[serde(rename = "Flow Packets/s")] + pub flow_packets_s: f64, + #[serde(rename = "Flow IAT Mean")] + pub flow_iat_mean: f64, + #[serde(rename = "Flow IAT Std")] + pub flow_iat_std: f64, + #[serde(rename = "Flow IAT Max")] + pub flow_iat_max: f64, + #[serde(rename = "Flow IAT Min")] + pub flow_iat_min: f64, + #[serde(rename = "Fwd IAT Total")] + pub fwd_iat_total: f64, + #[serde(rename = "Fwd IAT Mean")] + pub fwd_iat_mean: f64, + #[serde(rename = "Fwd IAT Std")] + pub fwd_iat_std: f64, + #[serde(rename = "Fwd IAT Max")] + pub fwd_iat_max: f64, + #[serde(rename = "Fwd IAT Min")] + pub fwd_iat_min: f64, + #[serde(rename = "Bwd IAT Total")] + pub bwd_iat_total: f64, + #[serde(rename = "Bwd IAT Mean")] + pub bwd_iat_mean: f64, + #[serde(rename = "Bwd IAT Std")] + pub bwd_iat_std: f64, + #[serde(rename = "Bwd IAT Max")] + pub bwd_iat_max: f64, + #[serde(rename = "Bwd IAT Min")] + pub bwd_iat_min: f64, + #[serde(rename = "Fwd PSH Flags")] + pub fwd_psh_flags: u32, + #[serde(rename = "Bwd PSH Flags")] + pub bwd_psh_flags: u32, + #[serde(rename = "Fwd URG Flags")] + pub fwd_urg_flags: u32, + #[serde(rename = "Bwd URG Flags")] + pub bwd_urg_flags: u32, + #[serde(rename = "Fwd Header Length")] + pub fwd_header_length: f64, + #[serde(rename = "Bwd Header Length")] + pub bwd_header_length: f64, + #[serde(rename = "Fwd Packets/s")] + pub fwd_packets_s: f64, + #[serde(rename = "Bwd Packets/s")] + pub bwd_packets_s: f64, + #[serde(rename = "Min Packet Length")] + pub min_packet_length: f64, + #[serde(rename = "Max Packet Length")] + pub max_packet_length: f64, + #[serde(rename = "Packet Length Mean")] + pub packet_length_mean: f64, + #[serde(rename = "Packet Length Std")] + pub packet_length_std: f64, + #[serde(rename = "Packet Length Variance")] + pub packet_length_variance: f64, + #[serde(rename = "FIN Flag Count")] + pub fin_flag_count: u32, + #[serde(rename = "SYN Flag Count")] + pub syn_flag_count: u32, + #[serde(rename = "RST Flag Count")] + pub rst_flag_count: u32, + #[serde(rename = "PSH Flag Count")] + pub psh_flag_count: u32, + #[serde(rename = "ACK Flag Count")] + pub ack_flag_count: u32, + #[serde(rename = "URG Flag Count")] + pub urg_flag_count: u32, + #[serde(rename = "CWE Flag Count")] + pub cwe_flag_count: u32, + #[serde(rename = "ECE Flag Count")] + pub ece_flag_count: u32, + #[serde(rename = "Down/Up Ratio")] + pub down_up_ratio: f64, + #[serde(rename = "Average Packet Size")] + pub average_packet_size: f64, + #[serde(rename = "Avg Fwd Segment Size")] + pub avg_fwd_segment_size: f64, + #[serde(rename = "Avg Bwd Segment Size")] + pub avg_bwd_segment_size: f64, + #[serde(rename = "Fwd Avg Bytes/Bulk")] + pub fwd_avg_bytes_bulk: u64, + #[serde(rename = "Fwd Avg Packets/Bulk")] + pub fwd_avg_packets_bulk: u64, + #[serde(rename = "Fwd Avg Bulk Rate")] + pub fwd_avg_bulk_rate: f64, + #[serde(rename = "Bwd Avg Bytes/Bulk")] + pub bwd_avg_bytes_bulk: u64, + #[serde(rename = "Bwd Avg Packets/Bulk")] + pub bwd_avg_packets_bulk: u64, + #[serde(rename = "Bwd Avg Bulk Rate")] + pub bwd_avg_bulk_rate: f64, + #[serde(rename = "Subflow Fwd Packets")] + pub subflow_fwd_packets: u32, + #[serde(rename = "Subflow Fwd Bytes")] + pub subflow_fwd_bytes: u64, + #[serde(rename = "Subflow Bwd Packets")] + pub subflow_bwd_packets: u32, + #[serde(rename = "Subflow Bwd Bytes")] + pub subflow_bwd_bytes: u64, + #[serde(rename = "Init_Win_bytes_forward")] + pub init_win_bytes_forward: i64, + #[serde(rename = "Init_Win_bytes_backward")] + pub init_win_bytes_backward: i64, + #[serde(rename = "act_data_pkt_fwd")] + pub act_data_pkt_fwd: f64, + #[serde(rename = "min_seg_size_forward")] + pub min_seg_size_forward: u32, + #[serde(rename = "Active Mean")] + pub active_mean: f64, + #[serde(rename = "Active Std")] + pub active_std: f64, + #[serde(rename = "Active Max")] + pub active_max: f64, + #[serde(rename = "Active Min")] + pub active_min: f64, + #[serde(rename = "Idle Mean")] + pub idle_mean: f64, + #[serde(rename = "Idle Std")] + pub idle_std: f64, + #[serde(rename = "Idle Max")] + pub idle_max: f64, + #[serde(rename = "Idle Min")] + pub idle_min: f64, + #[serde(rename = "Label")] + pub label: String, +} + +impl Print for CicRecord { + fn print(&self) { + println!( + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},\ + {},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},\ + {},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},\ + {},{},{},{},{},{},{},{},{}", + self.flow_id, + self.src_ip, + self.src_port, + self.dst_ip, + self.dst_port, + self.protocol, + self.timestamp, + self.flow_duration, + self.tot_fwd_pkts, + self.tot_bwd_pkts, + self.totlen_fwd_pkts, + self.totlen_bwd_pkts, + self.fwd_pkt_len_max, + self.fwd_pkt_len_min, + self.fwd_pkt_len_mean, + self.fwd_pkt_len_std, + self.bwd_pkt_len_max, + self.bwd_pkt_len_min, + self.bwd_pkt_len_mean, + self.bwd_pkt_len_std, + self.flow_bytes_s, + self.flow_packets_s, + self.flow_iat_mean, + self.flow_iat_std, + self.flow_iat_max, + self.flow_iat_min, + self.fwd_iat_total, + self.fwd_iat_mean, + self.fwd_iat_std, + self.fwd_iat_max, + self.fwd_iat_min, + self.bwd_iat_total, + self.bwd_iat_mean, + self.bwd_iat_std, + self.bwd_iat_max, + self.bwd_iat_min, + self.fwd_psh_flags, + self.bwd_psh_flags, + self.fwd_urg_flags, + self.bwd_urg_flags, + self.fwd_header_length, + self.bwd_header_length, + self.fwd_packets_s, + self.bwd_packets_s, + self.min_packet_length, + self.max_packet_length, + self.packet_length_mean, + self.packet_length_std, + self.packet_length_variance, + self.fin_flag_count, + self.syn_flag_count, + self.rst_flag_count, + self.psh_flag_count, + self.ack_flag_count, + self.urg_flag_count, + self.cwe_flag_count, + self.ece_flag_count, + self.down_up_ratio, + self.average_packet_size, + self.avg_fwd_segment_size, + self.avg_bwd_segment_size, + self.fwd_avg_bytes_bulk, + self.fwd_avg_packets_bulk, + self.fwd_avg_bulk_rate, + self.bwd_avg_bytes_bulk, + self.bwd_avg_packets_bulk, + self.bwd_avg_bulk_rate, + self.subflow_fwd_packets, + self.subflow_fwd_bytes, + self.subflow_bwd_packets, + self.subflow_bwd_bytes, + self.init_win_bytes_forward, + self.init_win_bytes_backward, + self.act_data_pkt_fwd, + self.min_seg_size_forward, + self.active_mean, + self.active_std, + self.active_max, + self.active_min, + self.idle_mean, + self.idle_std, + self.idle_max, + self.idle_min, + self.label + ); + } +} diff --git a/src/records/mod.rs b/src/records/mod.rs new file mode 100644 index 0000000..2d8ef36 --- /dev/null +++ b/src/records/mod.rs @@ -0,0 +1,2 @@ +pub mod cic_record; +pub mod print; diff --git a/src/records/print.rs b/src/records/print.rs new file mode 100644 index 0000000..aca5be9 --- /dev/null +++ b/src/records/print.rs @@ -0,0 +1,3 @@ +pub trait Print { + fn print(&self); +}