-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from matissecallewaert/feature/13-csv-parser
Feature/13 csv parser
- Loading branch information
Showing
10 changed files
with
496 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
max_width = 100 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
use clap::{Parser, Subcommand}; | ||
|
||
#[derive(Debug, Parser)] | ||
#[clap(author, version, about)] | ||
pub struct Cli { | ||
#[clap(subcommand)] | ||
pub command: Commands, | ||
} | ||
|
||
#[derive(Debug, Subcommand)] | ||
pub enum Commands { | ||
/// Real-time feature extraction | ||
Realtime, | ||
|
||
/// Feature extraction from a dataset | ||
Dataset { | ||
#[clap(value_enum)] | ||
dataset: Dataset, | ||
|
||
/// The relative path to the dataset | ||
path: String, | ||
}, | ||
} | ||
|
||
#[derive(clap::ValueEnum, Clone, Debug)] | ||
pub enum Dataset { | ||
/// CIC-IDS2017 from the Canadian Institute for Cybersecurity | ||
CicIds2017, | ||
|
||
/// CSE-CIC-IDS2018 from the Canadian Institute for Cybersecurity | ||
CseCicIds2018, | ||
|
||
/// CIC-DDoS2019 from the Canadian Institute for Cybersecurity | ||
CicDdos2019, | ||
|
||
/// CIC-IDS-Collection from Laurens D'Hooge | ||
CicIdsCollection, | ||
|
||
/// CTU-13 from CTU university of the Czech Republic | ||
Ctu13, | ||
|
||
/// CTU-13 without contaminant features from Laurens D'Hooge | ||
Ctu13Ld, | ||
|
||
/// UNSW-NB15 from UNSW Sydney | ||
UnswNb15, | ||
|
||
/// UNSW-NB15 without contaminant features from Laurens D'Hooge | ||
UnswNb15Ld, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,75 @@ | ||
mod args; | ||
mod parsers; | ||
mod records; | ||
|
||
use core::panic; | ||
|
||
use args::{Cli, Commands, Dataset}; | ||
use clap::Parser; | ||
|
||
use crate::{ | ||
parsers::csv_parser::CsvParser, | ||
records::{cic_record::CicRecord, print::Print}, | ||
}; | ||
|
||
fn main() { | ||
println!("Hello, world!"); | ||
let cli = Cli::parse(); | ||
|
||
match cli.command { | ||
Commands::Realtime => { | ||
handle_realtime(); | ||
} | ||
Commands::Dataset { dataset, path } => { | ||
handle_dataset(dataset, &path); | ||
} | ||
} | ||
} | ||
|
||
/// Handles the `realtime` subcommand.
///
/// Currently this only announces the selected mode on stdout.
fn handle_realtime() {
    println!("Real-time feature extraction");
}
|
||
fn handle_dataset(dataset: Dataset, path: &str) { | ||
println!( | ||
"Dataset feature extraction for {:?} from path: {}", | ||
dataset, path | ||
); | ||
|
||
match dataset { | ||
Dataset::CicIds2017 => { | ||
if path.ends_with(".csv") { | ||
let parser = CsvParser; | ||
|
||
match parser.parse::<CicRecord>(path) { | ||
Ok(records) => { | ||
for record in records { | ||
match record { | ||
Ok(record) => { | ||
record.print(); | ||
} | ||
Err(err) => { | ||
// TODO: Will we output to stderr, drop the record or use default values? | ||
eprintln!("Error: {:?}", err); | ||
} | ||
} | ||
} | ||
} | ||
Err(err) => { | ||
eprintln!("Error: {:?}", err); | ||
} | ||
} | ||
} else if path.ends_with(".pcap") { | ||
panic!("This file format is not supported yet..."); | ||
|
||
} else if path.ends_with(".parquet") { | ||
panic!("This file format is not supported yet..."); | ||
|
||
} else { | ||
panic!("This file format is not supported..."); | ||
} | ||
} | ||
_ => { | ||
panic!("This is not implemented yet..."); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
use super::parser::ReadError; | ||
use csv::{ReaderBuilder, StringRecord, Trim}; | ||
use serde::de::DeserializeOwned; | ||
use std::collections::HashSet; | ||
use std::fs::File; | ||
|
||
/// Stateless parser for CSV dataset files; see [`CsvParser::parse`].
pub struct CsvParser;
|
||
fn preprocess_headers(headers: &StringRecord) -> (StringRecord, HashSet<usize>) { | ||
let mut unique_headers = StringRecord::new(); | ||
let mut seen = HashSet::new(); | ||
let mut indices = HashSet::new(); | ||
|
||
for (index, header) in headers.iter().enumerate() { | ||
let trimmed_header = header.trim(); | ||
if !seen.contains(trimmed_header) { | ||
unique_headers.push_field(trimmed_header); | ||
seen.insert(trimmed_header); | ||
indices.insert(index); | ||
} | ||
} | ||
|
||
(unique_headers, indices) | ||
} | ||
|
||
fn filter_record(record: &StringRecord, indices: &HashSet<usize>) -> StringRecord { | ||
let mut filtered = StringRecord::new(); | ||
for (index, field) in record.iter().enumerate() { | ||
if indices.contains(&index) { | ||
filtered.push_field(field); | ||
} | ||
} | ||
filtered | ||
} | ||
|
||
impl CsvParser { | ||
pub fn parse<T>( | ||
&self, | ||
file_path: &str, | ||
) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError> | ||
where | ||
T: DeserializeOwned + 'static, | ||
{ | ||
let file = File::open(file_path).map_err(ReadError::Io)?; | ||
let mut rdr = ReaderBuilder::new().trim(Trim::All).from_reader(file); | ||
|
||
let headers = rdr.headers()?.clone(); | ||
let (unique_headers, indices) = preprocess_headers(&headers); | ||
|
||
rdr.set_headers(unique_headers.clone()); | ||
|
||
let iter = rdr.into_records().map(move |result| { | ||
result.map_err(ReadError::Csv).and_then(|record| { | ||
let filtered_record = filter_record(&record, &indices); | ||
|
||
csv::StringRecord::deserialize(&filtered_record, Some(&unique_headers)) | ||
.map_err(ReadError::Csv) | ||
}) | ||
}); | ||
|
||
Ok(Box::new(iter)) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pub mod csv_parser; | ||
pub mod parser; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
use csv::Error as CsvError; | ||
use serde::de::DeserializeOwned; | ||
use std::io; | ||
|
||
#[derive(Debug)] | ||
pub enum ReadError { | ||
Io(io::Error), | ||
Csv(CsvError), | ||
} | ||
|
||
impl From<io::Error> for ReadError { | ||
fn from(err: io::Error) -> Self { | ||
ReadError::Io(err) | ||
} | ||
} | ||
|
||
impl From<CsvError> for ReadError { | ||
fn from(err: CsvError) -> Self { | ||
ReadError::Csv(err) | ||
} | ||
} | ||
|
||
pub trait Parser { | ||
fn parse<T>( | ||
&self, | ||
file_path: &str, | ||
) -> Result<Box<dyn Iterator<Item = Result<T, ReadError>>>, ReadError> | ||
where | ||
T: DeserializeOwned + 'static; | ||
} |
Oops, something went wrong.