Skip to content

Commit

Permalink
Merge pull request #2 from matissecallewaert/feature/13-csv-parser
Browse files Browse the repository at this point in the history
Feature/13 csv parser
  • Loading branch information
matissecallewaert authored Feb 17, 2024
2 parents 455431e + 79ea7b7 commit b10802c
Show file tree
Hide file tree
Showing 10 changed files with 496 additions and 1 deletion.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.0", features = ["derive"] }
csv = "1.3.0"
serde = { version = "1.0.196", features = ["derive"] }
1 change: 1 addition & 0 deletions rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
max_width = 100
50 changes: 50 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use clap::{Parser, Subcommand};

// Top-level CLI definition, parsed by clap's derive macro.
// NOTE: no `///` doc comment here on purpose — clap would use it as the
// `about` help text, overriding the Cargo.toml description.
#[derive(Debug, Parser)]
#[clap(author, version, about)]
pub struct Cli {
    // The selected subcommand; see `Commands` for the available modes.
    #[clap(subcommand)]
    pub command: Commands,
}

// The two operating modes of the tool. The `///` lines below are surfaced
// verbatim by clap as per-subcommand help text — do not edit casually.
#[derive(Debug, Subcommand)]
pub enum Commands {
    /// Real-time feature extraction
    Realtime,

    /// Feature extraction from a dataset
    Dataset {
        // Which known dataset layout the file at `path` uses.
        #[clap(value_enum)]
        dataset: Dataset,

        /// The relative path to the dataset
        path: String,
    },
}

// Supported dataset formats, selectable on the command line via ValueEnum.
// The `///` lines are shown by clap in `--help` output — keep them stable.
#[derive(clap::ValueEnum, Clone, Debug)]
pub enum Dataset {
    /// CIC-IDS2017 from the Canadian Institute for Cybersecurity
    CicIds2017,

    /// CSE-CIC-IDS2018 from the Canadian Institute for Cybersecurity
    CseCicIds2018,

    /// CIC-DDoS2019 from the Canadian Institute for Cybersecurity
    CicDdos2019,

    /// CIC-IDS-Collection from Laurens D'Hooge
    CicIdsCollection,

    /// CTU-13 from CTU university of the Czech Republic
    Ctu13,

    /// CTU-13 without contaminant features from Laurens D'Hooge
    Ctu13Ld,

    /// UNSW-NB15 from UNSW Sydney
    UnswNb15,

    /// UNSW-NB15 without contaminant features from Laurens D'Hooge
    UnswNb15Ld,
}
74 changes: 73 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,75 @@
mod args;
mod parsers;
mod records;

use core::panic;

use args::{Cli, Commands, Dataset};
use clap::Parser;

use crate::{
parsers::csv_parser::CsvParser,
records::{cic_record::CicRecord, print::Print},
};

/// Entry point: parses command-line arguments and dispatches to the
/// handler for the selected subcommand.
fn main() {
    // Removed leftover `println!("Hello, world!")` debug scaffolding that
    // was printed before every run.
    let cli = Cli::parse();

    match cli.command {
        Commands::Realtime => {
            handle_realtime();
        }
        Commands::Dataset { dataset, path } => {
            handle_dataset(dataset, &path);
        }
    }
}

/// Placeholder for the live-capture mode; currently only announces itself
/// on stdout.
fn handle_realtime() {
    println!("Real-time feature extraction");
}

/// Dispatches offline feature extraction for `dataset` on the file at
/// `path`, choosing a parser by file extension.
///
/// # Panics
/// Panics for file formats and datasets that are not implemented yet
/// (only CIC-IDS2017 CSV input is currently handled).
fn handle_dataset(dataset: Dataset, path: &str) {
    println!(
        "Dataset feature extraction for {:?} from path: {}",
        dataset, path
    );

    match dataset {
        Dataset::CicIds2017 => {
            if path.ends_with(".csv") {
                parse_cic_csv(path);
            } else if path.ends_with(".pcap") || path.ends_with(".parquet") {
                // Known formats we plan to support.
                panic!("This file format is not supported yet...");
            } else {
                panic!("This file format is not supported...");
            }
        }
        // Remaining Dataset variants are intentionally unimplemented for now.
        _ => {
            panic!("This is not implemented yet...");
        }
    }
}

/// Parses a CIC-style CSV file and prints every successfully deserialized
/// record; per-record failures are reported on stderr without aborting.
fn parse_cic_csv(path: &str) {
    let parser = CsvParser;

    match parser.parse::<CicRecord>(path) {
        Ok(records) => {
            for record in records {
                match record {
                    Ok(record) => {
                        record.print();
                    }
                    Err(err) => {
                        // TODO: Will we output to stderr, drop the record or use default values?
                        eprintln!("Error: {:?}", err);
                    }
                }
            }
        }
        Err(err) => {
            eprintln!("Error: {:?}", err);
        }
    }
}
63 changes: 63 additions & 0 deletions src/parsers/csv_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use super::parser::ReadError;
use csv::{ReaderBuilder, StringRecord, Trim};
use serde::de::DeserializeOwned;
use std::collections::HashSet;
use std::fs::File;

pub struct CsvParser;

/// Deduplicates CSV header names, keeping the first occurrence of each
/// (after trimming surrounding whitespace).
///
/// Returns the deduplicated headers together with the set of column
/// indices that survived, so data rows can be filtered to match.
fn preprocess_headers(headers: &StringRecord) -> (StringRecord, HashSet<usize>) {
    let mut unique_headers = StringRecord::new();
    let mut seen = HashSet::new();
    let mut indices = HashSet::new();

    for (index, header) in headers.iter().enumerate() {
        let trimmed_header = header.trim();
        // HashSet::insert returns true only for values not seen before,
        // replacing the previous contains()-then-insert double lookup.
        if seen.insert(trimmed_header) {
            unique_headers.push_field(trimmed_header);
            indices.insert(index);
        }
    }

    (unique_headers, indices)
}

fn filter_record(record: &StringRecord, indices: &HashSet<usize>) -> StringRecord {
let mut filtered = StringRecord::new();
for (index, field) in record.iter().enumerate() {
if indices.contains(&index) {
filtered.push_field(field);
}
}
filtered
}

impl CsvParser {
    /// Opens the CSV file at `file_path` and returns a lazy iterator of
    /// deserialized records of type `T`.
    ///
    /// Duplicate header names are dropped (first occurrence wins) and each
    /// data row is filtered down to the surviving columns before
    /// deserialization, so `T` only needs fields for the unique headers.
    ///
    /// # Errors
    /// Returns `ReadError::Io` if the file cannot be opened and
    /// `ReadError::Csv` if the header row cannot be read. Per-record parse
    /// failures are yielded as `Err` items by the iterator instead of
    /// aborting the whole parse.
    pub fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static,
    {
        let file = File::open(file_path).map_err(ReadError::Io)?;
        // Trim::All strips surrounding whitespace from headers and fields.
        let mut rdr = ReaderBuilder::new().trim(Trim::All).from_reader(file);

        let headers = rdr.headers()?.clone();
        let (unique_headers, indices) = preprocess_headers(&headers);

        rdr.set_headers(unique_headers.clone());

        // `move` transfers `indices` and `unique_headers` into the closure
        // so the boxed iterator can outlive this function ('static bound).
        let iter = rdr.into_records().map(move |result| {
            result.map_err(ReadError::Csv).and_then(|record| {
                let filtered_record = filter_record(&record, &indices);

                csv::StringRecord::deserialize(&filtered_record, Some(&unique_headers))
                    .map_err(ReadError::Csv)
            })
        });

        Ok(Box::new(iter))
    }
}
2 changes: 2 additions & 0 deletions src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod csv_parser;
pub mod parser;
30 changes: 30 additions & 0 deletions src/parsers/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use csv::Error as CsvError;
use serde::de::DeserializeOwned;
use std::io;

/// Errors that can occur while reading and parsing an input file.
#[derive(Debug)]
pub enum ReadError {
    // Underlying filesystem / I/O failure (e.g. file not found).
    Io(io::Error),
    // CSV reading or deserialization failure from the csv crate.
    Csv(CsvError),
}

// Display + Error impls let callers print the error with `{}` and convert
// it into `Box<dyn Error>` / use it with `?` in generic error contexts.
// Full paths are used to avoid adding new `use` items to this file.
impl std::fmt::Display for ReadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadError::Io(err) => write!(f, "I/O error: {}", err),
            ReadError::Csv(err) => write!(f, "CSV error: {}", err),
        }
    }
}

impl std::error::Error for ReadError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            ReadError::Io(err) => Some(err),
            ReadError::Csv(err) => Some(err),
        }
    }
}

impl From<io::Error> for ReadError {
fn from(err: io::Error) -> Self {
ReadError::Io(err)
}
}

impl From<CsvError> for ReadError {
fn from(err: CsvError) -> Self {
ReadError::Csv(err)
}
}

/// Common interface for dataset file parsers.
///
/// Implementations open the file at `file_path` and return a lazy, boxed
/// iterator of deserialized records; each item may individually fail with
/// a `ReadError` without aborting the whole parse.
///
/// NOTE(review): `CsvParser::parse` currently has this exact signature as
/// an inherent method but does not implement this trait — consider
/// `impl Parser for CsvParser` to keep them in sync.
pub trait Parser {
    fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static;
}
Loading

0 comments on commit b10802c

Please sign in to comment.