Skip to content

Commit

Permalink
Merge pull request #2 from matissecallewaert/feature/13-csv-parser
Browse files Browse the repository at this point in the history
Feature/13 csv parser
  • Loading branch information
matissecallewaert authored Feb 17, 2024
2 parents 455431e + 79ea7b7 commit b10802c
Show file tree
Hide file tree
Showing 10 changed files with 496 additions and 1 deletion.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.0", features = ["derive"] }
csv = "1.3.0"
serde = { version = "1.0.196", features = ["derive"] }
1 change: 1 addition & 0 deletions rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
max_width = 100
50 changes: 50 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use clap::{Parser, Subcommand};

// Top-level CLI definition, parsed by clap's derive macro.
// NOTE: no `///` doc comment here on purpose — clap would use it as the
// `about` help text, overriding the Cargo.toml description.
#[derive(Debug, Parser)]
#[clap(author, version, about)]
pub struct Cli {
    // The selected subcommand; see `Commands` for the available modes.
    #[clap(subcommand)]
    pub command: Commands,
}

// The two operating modes of the tool. The `///` lines below are surfaced
// verbatim by clap as per-subcommand help text — do not edit casually.
#[derive(Debug, Subcommand)]
pub enum Commands {
    /// Real-time feature extraction
    Realtime,

    /// Feature extraction from a dataset
    Dataset {
        // Which known dataset layout the file at `path` uses.
        #[clap(value_enum)]
        dataset: Dataset,

        /// The relative path to the dataset
        path: String,
    },
}

// Supported dataset formats, selectable on the command line via ValueEnum.
// The `///` lines are shown by clap in `--help` output — keep them stable.
#[derive(clap::ValueEnum, Clone, Debug)]
pub enum Dataset {
    /// CIC-IDS2017 from the Canadian Institute for Cybersecurity
    CicIds2017,

    /// CSE-CIC-IDS2018 from the Canadian Institute for Cybersecurity
    CseCicIds2018,

    /// CIC-DDoS2019 from the Canadian Institute for Cybersecurity
    CicDdos2019,

    /// CIC-IDS-Collection from Laurens D'Hooge
    CicIdsCollection,

    /// CTU-13 from CTU university of the Czech Republic
    Ctu13,

    /// CTU-13 without contaminant features from Laurens D'Hooge
    Ctu13Ld,

    /// UNSW-NB15 from UNSW Sydney
    UnswNb15,

    /// UNSW-NB15 without contaminant features from Laurens D'Hooge
    UnswNb15Ld,
}
74 changes: 73 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,75 @@
mod args;
mod parsers;
mod records;

use core::panic;

use args::{Cli, Commands, Dataset};
use clap::Parser;

use crate::{
parsers::csv_parser::CsvParser,
records::{cic_record::CicRecord, print::Print},
};

/// Entry point: parses command-line arguments and dispatches to the
/// handler for the selected subcommand.
fn main() {
    // Removed leftover `println!("Hello, world!")` debug scaffolding that
    // was printed before every run.
    let cli = Cli::parse();

    match cli.command {
        Commands::Realtime => {
            handle_realtime();
        }
        Commands::Dataset { dataset, path } => {
            handle_dataset(dataset, &path);
        }
    }
}

/// Placeholder for the live-capture mode; currently only announces itself
/// on stdout.
fn handle_realtime() {
    println!("Real-time feature extraction");
}

/// Dispatches offline feature extraction for `dataset` on the file at
/// `path`, choosing a parser by file extension.
///
/// # Panics
/// Panics for file formats and datasets that are not implemented yet
/// (only CIC-IDS2017 CSV input is currently handled).
fn handle_dataset(dataset: Dataset, path: &str) {
    println!(
        "Dataset feature extraction for {:?} from path: {}",
        dataset, path
    );

    match dataset {
        Dataset::CicIds2017 => {
            if path.ends_with(".csv") {
                parse_cic_csv(path);
            } else if path.ends_with(".pcap") || path.ends_with(".parquet") {
                // Known formats we plan to support.
                panic!("This file format is not supported yet...");
            } else {
                panic!("This file format is not supported...");
            }
        }
        // Remaining Dataset variants are intentionally unimplemented for now.
        _ => {
            panic!("This is not implemented yet...");
        }
    }
}

/// Parses a CIC-style CSV file and prints every successfully deserialized
/// record; per-record failures are reported on stderr without aborting.
fn parse_cic_csv(path: &str) {
    let parser = CsvParser;

    match parser.parse::<CicRecord>(path) {
        Ok(records) => {
            for record in records {
                match record {
                    Ok(record) => {
                        record.print();
                    }
                    Err(err) => {
                        // TODO: Will we output to stderr, drop the record or use default values?
                        eprintln!("Error: {:?}", err);
                    }
                }
            }
        }
        Err(err) => {
            eprintln!("Error: {:?}", err);
        }
    }
}
63 changes: 63 additions & 0 deletions src/parsers/csv_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use super::parser::ReadError;
use csv::{ReaderBuilder, StringRecord, Trim};
use serde::de::DeserializeOwned;
use std::collections::HashSet;
use std::fs::File;

pub struct CsvParser;

/// Deduplicates CSV header names, keeping the first occurrence of each
/// (after trimming surrounding whitespace).
///
/// Returns the deduplicated headers together with the set of column
/// indices that survived, so data rows can be filtered to match.
fn preprocess_headers(headers: &StringRecord) -> (StringRecord, HashSet<usize>) {
    let mut unique_headers = StringRecord::new();
    let mut seen = HashSet::new();
    let mut indices = HashSet::new();

    for (index, header) in headers.iter().enumerate() {
        let trimmed_header = header.trim();
        // HashSet::insert returns true only for values not seen before,
        // replacing the previous contains()-then-insert double lookup.
        if seen.insert(trimmed_header) {
            unique_headers.push_field(trimmed_header);
            indices.insert(index);
        }
    }

    (unique_headers, indices)
}

fn filter_record(record: &StringRecord, indices: &HashSet<usize>) -> StringRecord {
let mut filtered = StringRecord::new();
for (index, field) in record.iter().enumerate() {
if indices.contains(&index) {
filtered.push_field(field);
}
}
filtered
}

impl CsvParser {
    /// Opens the CSV file at `file_path` and returns a lazy iterator of
    /// deserialized records of type `T`.
    ///
    /// Duplicate header names are dropped (first occurrence wins) and each
    /// data row is filtered down to the surviving columns before
    /// deserialization, so `T` only needs fields for the unique headers.
    ///
    /// # Errors
    /// Returns `ReadError::Io` if the file cannot be opened and
    /// `ReadError::Csv` if the header row cannot be read. Per-record parse
    /// failures are yielded as `Err` items by the iterator instead of
    /// aborting the whole parse.
    pub fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static,
    {
        let file = File::open(file_path).map_err(ReadError::Io)?;
        // Trim::All strips surrounding whitespace from headers and fields.
        let mut rdr = ReaderBuilder::new().trim(Trim::All).from_reader(file);

        let headers = rdr.headers()?.clone();
        let (unique_headers, indices) = preprocess_headers(&headers);

        rdr.set_headers(unique_headers.clone());

        // `move` transfers `indices` and `unique_headers` into the closure
        // so the boxed iterator can outlive this function ('static bound).
        let iter = rdr.into_records().map(move |result| {
            result.map_err(ReadError::Csv).and_then(|record| {
                let filtered_record = filter_record(&record, &indices);

                csv::StringRecord::deserialize(&filtered_record, Some(&unique_headers))
                    .map_err(ReadError::Csv)
            })
        });

        Ok(Box::new(iter))
    }
}
2 changes: 2 additions & 0 deletions src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod csv_parser;
pub mod parser;
30 changes: 30 additions & 0 deletions src/parsers/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use csv::Error as CsvError;
use serde::de::DeserializeOwned;
use std::io;

/// Errors that can occur while reading and parsing an input file.
#[derive(Debug)]
pub enum ReadError {
    // Underlying filesystem / I/O failure (e.g. file not found).
    Io(io::Error),
    // CSV reading or deserialization failure from the csv crate.
    Csv(CsvError),
}

// Display + Error impls let callers print the error with `{}` and convert
// it into `Box<dyn Error>` / use it with `?` in generic error contexts.
// Full paths are used to avoid adding new `use` items to this file.
impl std::fmt::Display for ReadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadError::Io(err) => write!(f, "I/O error: {}", err),
            ReadError::Csv(err) => write!(f, "CSV error: {}", err),
        }
    }
}

impl std::error::Error for ReadError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            ReadError::Io(err) => Some(err),
            ReadError::Csv(err) => Some(err),
        }
    }
}

impl From<io::Error> for ReadError {
fn from(err: io::Error) -> Self {
ReadError::Io(err)
}
}

impl From<CsvError> for ReadError {
fn from(err: CsvError) -> Self {
ReadError::Csv(err)
}
}

/// Common interface for dataset file parsers.
///
/// Implementations open the file at `file_path` and return a lazy, boxed
/// iterator of deserialized records; each item may individually fail with
/// a `ReadError` without aborting the whole parse.
///
/// NOTE(review): `CsvParser::parse` currently has this exact signature as
/// an inherent method but does not implement this trait — consider
/// `impl Parser for CsvParser` to keep them in sync.
pub trait Parser {
    fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static;
}
Loading

0 comments on commit b10802c

Please sign in to comment.