Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/13 csv parser #2

Merged
merged 4 commits into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.0", features = ["derive"] }
csv = "1.3.0"
serde = { version = "1.0.196", features = ["derive"] }
1 change: 1 addition & 0 deletions rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
max_width = 100
50 changes: 50 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use clap::{Parser, Subcommand};

// Top-level CLI definition; clap derives the whole argument parser from this
// struct. NOTE(review): plain `//` comments are used deliberately — `///` doc
// comments on clap items become help/about text and would change CLI output.
#[derive(Debug, Parser)]
#[clap(author, version, about)]
pub struct Cli {
    // The subcommand chosen by the user (see `Commands`).
    #[clap(subcommand)]
    pub command: Commands,
}

// The two operating modes of the tool. The existing `///` comments double as
// clap help text and are kept byte-identical; new notes use `//` so the
// generated help output does not change.
#[derive(Debug, Subcommand)]
pub enum Commands {
    /// Real-time feature extraction
    Realtime,

    /// Feature extraction from a dataset
    Dataset {
        // Which known dataset layout to parse; restricted to the `Dataset`
        // value-enum variants below.
        #[clap(value_enum)]
        dataset: Dataset,

        /// The relative path to the dataset
        path: String,
    },
}

// Supported dataset identifiers, selectable on the command line. The `///`
// comments are the clap help strings and are kept byte-identical; variant
// order is preserved because it determines the help listing order.
#[derive(clap::ValueEnum, Clone, Debug)]
pub enum Dataset {
    /// CIC-IDS2017 from the Canadian Institute for Cybersecurity
    CicIds2017,

    /// CSE-CIC-IDS2018 from the Canadian Institute for Cybersecurity
    CseCicIds2018,

    /// CIC-DDoS2019 from the Canadian Institute for Cybersecurity
    CicDdos2019,

    /// CIC-IDS-Collection from Laurens D'Hooge
    CicIdsCollection,

    /// CTU-13 from CTU university of the Czech Republic
    Ctu13,

    /// CTU-13 without contaminant features from Laurens D'Hooge
    Ctu13Ld,

    /// UNSW-NB15 from UNSW Sydney
    UnswNb15,

    /// UNSW-NB15 without contaminant features from Laurens D'Hooge
    UnswNb15Ld,
}
74 changes: 73 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,75 @@
mod args;
mod parsers;
mod records;

use core::panic;

use args::{Cli, Commands, Dataset};
use clap::Parser;

use crate::{
parsers::csv_parser::CsvParser,
records::{cic_record::CicRecord, print::Print},
};

/// Entry point: parses the command line and dispatches to the handler for the
/// selected subcommand.
fn main() {
    // Removed the leftover `println!("Hello, world!")` scaffold line: it
    // polluted stdout on every invocation, interleaving with record output.
    let cli = Cli::parse();

    match cli.command {
        Commands::Realtime => {
            handle_realtime();
        }
        Commands::Dataset { dataset, path } => {
            handle_dataset(dataset, &path);
        }
    }
}

/// Placeholder for the real-time feature extraction mode; currently only
/// announces the mode on stdout.
fn handle_realtime() {
    let banner = "Real-time feature extraction";
    println!("{banner}");
}

/// Dispatches dataset feature extraction based on the dataset kind and the
/// input file's extension.
///
/// # Panics
///
/// Panics when the dataset variant or the file format is not supported yet
/// (same behavior as before; the panic messages are unchanged).
fn handle_dataset(dataset: Dataset, path: &str) {
    println!(
        "Dataset feature extraction for {:?} from path: {}",
        dataset, path
    );

    match dataset {
        Dataset::CicIds2017 => {
            if path.ends_with(".csv") {
                parse_csv(path);
            } else if path.ends_with(".pcap") || path.ends_with(".parquet") {
                // Both formats are planned but unimplemented; the two original
                // branches carried the identical message, so they are merged.
                panic!("This file format is not supported yet...");
            } else {
                panic!("This file format is not supported...");
            }
        }
        _ => {
            panic!("This is not implemented yet...");
        }
    }
}

/// Parses a CSV file of `CicRecord`s and prints every record that
/// deserializes successfully; per-record and file-level errors go to stderr.
fn parse_csv(path: &str) {
    let parser = CsvParser;

    match parser.parse::<CicRecord>(path) {
        Ok(records) => {
            for record in records {
                match record {
                    Ok(record) => record.print(),
                    Err(err) => {
                        // TODO: Will we output to stderr, drop the record or use default values?
                        eprintln!("Error: {:?}", err);
                    }
                }
            }
        }
        Err(err) => eprintln!("Error: {:?}", err),
    }
}
63 changes: 63 additions & 0 deletions src/parsers/csv_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use super::parser::ReadError;
use csv::{ReaderBuilder, StringRecord, Trim};
use serde::de::DeserializeOwned;
use std::collections::HashSet;
use std::fs::File;

pub struct CsvParser;

/// Trims and deduplicates a CSV header row.
///
/// Returns the deduplicated headers (first occurrence wins) together with the
/// set of column indices that were kept, so matching columns can later be
/// selected from each data record.
fn preprocess_headers(headers: &StringRecord) -> (StringRecord, HashSet<usize>) {
    let mut unique_headers = StringRecord::new();
    let mut seen = HashSet::new();
    let mut indices = HashSet::new();

    for (index, header) in headers.iter().enumerate() {
        let trimmed_header = header.trim();
        // `HashSet::insert` returns false for an already-present value, which
        // replaces the original `contains` + `insert` double lookup.
        if seen.insert(trimmed_header) {
            unique_headers.push_field(trimmed_header);
            indices.insert(index);
        }
    }

    (unique_headers, indices)
}

fn filter_record(record: &StringRecord, indices: &HashSet<usize>) -> StringRecord {
let mut filtered = StringRecord::new();
for (index, field) in record.iter().enumerate() {
if indices.contains(&index) {
filtered.push_field(field);
}
}
filtered
}

impl CsvParser {
    /// Parses the CSV file at `file_path` into a lazy, boxed iterator of
    /// records deserialized into `T`.
    ///
    /// Headers are trimmed and deduplicated up front; columns whose header is
    /// a duplicate are dropped from every record before deserialization.
    ///
    /// # Errors
    ///
    /// Fails the whole call when the file cannot be opened or the header row
    /// cannot be read; failures on individual rows are yielded as `Err` items
    /// by the returned iterator instead.
    pub fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static,
    {
        let file = File::open(file_path).map_err(ReadError::Io)?;
        // Trim::All trims fields as well as headers, matching the trimming
        // done in preprocess_headers.
        let mut rdr = ReaderBuilder::new().trim(Trim::All).from_reader(file);

        let headers = rdr.headers()?.clone();
        let (unique_headers, indices) = preprocess_headers(&headers);

        rdr.set_headers(unique_headers.clone());

        // `move` transfers `indices` and `unique_headers` into the closure so
        // the iterator can outlive this function (it is returned boxed).
        let iter = rdr.into_records().map(move |result| {
            result.map_err(ReadError::Csv).and_then(|record| {
                // Drop the fields of duplicate columns so the record lines up
                // with the deduplicated header row.
                let filtered_record = filter_record(&record, &indices);

                csv::StringRecord::deserialize(&filtered_record, Some(&unique_headers))
                    .map_err(ReadError::Csv)
            })
        });

        Ok(Box::new(iter))
    }
}
2 changes: 2 additions & 0 deletions src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod csv_parser;
pub mod parser;
30 changes: 30 additions & 0 deletions src/parsers/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use csv::Error as CsvError;
use serde::de::DeserializeOwned;
use std::io;

/// Errors that can occur while reading a dataset file.
#[derive(Debug)]
pub enum ReadError {
    /// Underlying I/O failure (e.g. the file could not be opened).
    Io(io::Error),
    /// CSV reading or deserialization failure.
    Csv(CsvError),
}

// Public error types conventionally implement Display and std::error::Error
// so callers can use `{}` formatting, `?` with boxed errors, and `source()`
// chaining. Fully-qualified paths avoid touching the import list.
impl std::fmt::Display for ReadError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ReadError::Io(err) => write!(f, "I/O error: {}", err),
            ReadError::Csv(err) => write!(f, "CSV error: {}", err),
        }
    }
}

impl std::error::Error for ReadError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            ReadError::Io(err) => Some(err),
            ReadError::Csv(err) => Some(err),
        }
    }
}

impl From<io::Error> for ReadError {
fn from(err: io::Error) -> Self {
ReadError::Io(err)
}
}

impl From<CsvError> for ReadError {
fn from(err: CsvError) -> Self {
ReadError::Csv(err)
}
}

/// Common interface for dataset file parsers.
pub trait Parser {
    /// Parses the file at `file_path` into a lazy, boxed iterator of records
    /// deserialized into `T`.
    ///
    /// # Errors
    ///
    /// Implementations return a `ReadError` when the file cannot be opened or
    /// prepared; per-record failures are yielded as `Err` items instead of
    /// aborting iteration.
    fn parse<T>(
        &self,
        file_path: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError>
    where
        T: DeserializeOwned + 'static;
}
Loading
Loading