Skip to content

Commit

Permalink
[FEAT] Ellipsize glob scan paths (resolves #2709)
Browse files Browse the repository at this point in the history
  • Loading branch information
anmolsingh20 committed Sep 9, 2024
1 parent 7fe3dbc commit deb577f
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
21 changes: 20 additions & 1 deletion src/daft-scan/src/glob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,28 @@ impl ScanOperator for GlobScanOperator {
}

fn multiline_display(&self) -> Vec<String> {
let condensed_glob_paths = if self.glob_paths.len() <= 7 {
self.glob_paths.join(", ")
} else {
let first_three: Vec<String> = self.glob_paths.iter().take(3).cloned().collect();
let last_three: Vec<String> = self
.glob_paths
.iter()
.skip(self.glob_paths.len() - 3)
.cloned()
.collect();

let mut result = first_three.join(", ");
result.push_str(", ...");
result.push_str(", ");
result.push_str(&last_three.join(", "));

result
};

let mut lines = vec![
"GlobScanOperator".to_string(),
format!("Glob paths = [{}]", self.glob_paths.join(", ")),
format!("Glob paths = [{}]", condensed_glob_paths),
];
lines.extend(self.file_format_config.multiline_display());
lines.extend(self.storage_config.multiline_display());
Expand Down
41 changes: 40 additions & 1 deletion src/daft-scan/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -967,8 +967,9 @@ mod test {

use crate::{
file_format::{FileFormatConfig, ParquetSourceConfig},
glob::GlobScanOperator,
storage_config::{NativeStorageConfig, StorageConfig},
DataSource, Pushdowns, ScanTask,
DataSource, Pushdowns, ScanOperator, ScanTask,
};

fn make_scan_task(num_sources: usize) -> ScanTask {
Expand Down Expand Up @@ -1003,6 +1004,44 @@ mod test {
)
}

fn make_glob_scan_operator(num_sources: usize) -> GlobScanOperator {
let file_format_config: FileFormatConfig = FileFormatConfig::Parquet(ParquetSourceConfig {
coerce_int96_timestamp_unit: TimeUnit::Seconds,
field_id_mapping: None,
row_groups: None,
chunk_size: None,
});

let mut sources: Vec<String> = Vec::new();

for _ in 0..num_sources {
sources.push(format!("../../tests/assets/parquet-data/mvp.parquet"));
}

let glob_paths: Vec<&str> = sources.iter().map(|s| s.as_str()).collect();

let glob_scan_operator: GlobScanOperator = GlobScanOperator::try_new(
&glob_paths,
Arc::new(file_format_config),
Arc::new(StorageConfig::Native(Arc::new(
NativeStorageConfig::new_internal(false, None),
))),
false,
Some(Arc::new(Schema::empty())),
)
.unwrap();

glob_scan_operator
}

#[test]
fn test_glob_display_condenses() -> DaftResult<()> {
let glob_scan_operator: GlobScanOperator = make_glob_scan_operator(8);
let condensed_glob_paths: Vec<String> = glob_scan_operator.multiline_display();
assert_eq!(condensed_glob_paths[1], "Glob paths = [../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ..., ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet, ../../tests/assets/parquet-data/mvp.parquet]");
Ok(())
}

#[test]
fn test_display_condenses() -> DaftResult<()> {
let scan_task = make_scan_task(7);
Expand Down

0 comments on commit deb577f

Please sign in to comment.