From ed4a17ecf94677c7c8a99848287b44b95f3dc458 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 31 May 2024 18:18:17 +0200 Subject: [PATCH] Writing a small description of each CS DSL function in the documentation --- .../dataframe/api/ColumnsSelectionDsl.kt | 2 +- .../dataframe/api/ColumnsSelectionDsl.kt | 2 +- docs/StardustDocs/topics/ColumnSelectors.md | 385 +++++++++++++++++- 3 files changed, 385 insertions(+), 4 deletions(-) diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt index 4fd814f775..da02f51ebc 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt @@ -123,7 +123,7 @@ public interface ColumnsSelectionDsl : /* SingleColumn> */ // except(), allExcept {}, allColsExcept {} AllExceptColumnsSelectionDsl, - // nameContains(""), colsNameContains(""), nameStartsWith(""), childrenNameEndsWith("") + // nameContains(""), colsNameContains(""), nameStartsWith(""), colsNameEndsWith("") ColumnNameFiltersColumnsSelectionDsl, // withoutNulls(), colsWithoutNulls() WithoutNullsColumnsSelectionDsl, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt index 2a7dfa5563..4295701cd3 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ColumnsSelectionDsl.kt @@ -97,7 +97,7 @@ public interface ColumnsSelectionDsl : /* SingleColumn> */ // except(), allExcept {}, allColsExcept {} AllExceptColumnsSelectionDsl, - // nameContains(""), colsNameContains(""), nameStartsWith(""), childrenNameEndsWith("") + // nameContains(""), colsNameContains(""), 
nameStartsWith(""), colsNameEndsWith("") ColumnNameFiltersColumnsSelectionDsl, // withoutNulls(), colsWithoutNulls() WithoutNullsColumnsSelectionDsl, diff --git a/docs/StardustDocs/topics/ColumnSelectors.md b/docs/StardustDocs/topics/ColumnSelectors.md index b6f63088ea..7e079fb105 100644 --- a/docs/StardustDocs/topics/ColumnSelectors.md +++ b/docs/StardustDocs/topics/ColumnSelectors.md @@ -2,7 +2,7 @@ -[`DataFrame`](DataFrame.md) provides a DSL for selecting an arbitrary set of columns. +[`DataFrame`](DataFrame.md) provides a DSL for selecting an arbitrary set of columns: the Columns Selection DSL. Column selectors are used in many operations: @@ -21,7 +21,7 @@ df.move { name.firstName and name.lastName }.after { city } -#### Full DSL Grammar: +#### Full DSL Grammar {collapsible="true"} **Definitions** @@ -39,6 +39,387 @@ df.move { name.firstName and name.lastName }.after { city } +#### Functions Overview {collapsible="true"} + +##### First (Col), Last (Col), Single (Col) {collapsible="true"} +`first {}`, `firstCol()`, `last {}`, `lastCol()`, `single {}`, `singleCol()` + +Returns the first, last, or single column from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` that adheres to the optional given condition. If no column adheres to the given condition, +`NoSuchElementException` is thrown. + +##### Col {collapsible="true"} +`col(name)`, `col(5)`, `this[5]` + +Creates a [ColumnAccessor](DataColumn.md#column-accessors) (or `SingleColumn`) for a column with the given +argument from the top-level or specified [column group](DataColumn.md#columngroup). The argument can be either an +index (`Int`) or a reference to a column (`String`, `ColumnPath`, `KProperty`, or `ColumnAccessor`; +any [AccessApi](apiLevels.md)). 
+ +##### Value Col, Frame Col, Col Group {collapsible="true"} +`valueCol(name)`, `valueCol(5)`, `frameCol(name)`, `frameCol(5)`, `colGroup(name)`, `colGroup(5)` + +Creates a [ColumnAccessor](DataColumn.md#column-accessors) (or `SingleColumn`) for a +[value column](DataColumn.md#valuecolumn) / [frame column](DataColumn.md#framecolumn) / +[column group](DataColumn.md#columngroup) with the given argument from the top-level or +specified [column group](DataColumn.md#columngroup). The argument can be either an index (`Int`) or a reference +to a column (`String`, `ColumnPath`, `KProperty`, or `ColumnAccessor`; any [AccessApi](apiLevels.md)). +The functions can be both typed and untyped (in case you're supplying a column name, -path, or index). +These functions throw an `IllegalArgumentException` if the column found is not the right kind. + +##### Cols {collapsible="true"} +`cols {}`, `cols()`, `cols(colA, colB)`, `cols(1, 5)`, `cols(1..5)`, `[{}]`, `colSet[1, 3]` + +Creates a subset of columns (`ColumnSet`) from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet`. +You can use either a `ColumnFilter`, or any of the `vararg` overloads for any [AccessApi](apiLevels.md). +The function can be both typed and untyped (in case you're supplying a column name, -path, or index (range)). + +Note that you can also use the `[]` operator for most overloads of `cols` to achieve the same result. + +##### Range of Columns {collapsible="true"} +`colA.."colB"` + +Creates a `ColumnSet` containing all columns from `colA` to `colB` (inclusive) from the top-level. +Columns inside [column groups](DataColumn.md#columngroup) are also supported +(as long as they share the same direct parent), as well as any combination of [AccessApi](apiLevels.md). 
+ +##### Value Columns, Frame Columns, Column Groups {collapsible="true"} +`valueCols {}`, `valueCols()`, `frameCols {}`, `frameCols()`, `colGroups {}`, `colGroups()` + +Creates a subset of columns (`ColumnSet`) from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` containing only [value columns](DataColumn.md#valuecolumn) / [frame columns](DataColumn.md#framecolumn) / +[column groups](DataColumn.md#columngroup) that adhere to the optional condition. + +##### Cols of Kind {collapsible="true"} +`colsOfKind(Value, Frame) {}`, `colsOfKind(Group, Frame)` + +Creates a subset of columns (`ColumnSet`) from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` containing only columns of the specified kind(s) that adhere to the optional condition. + +##### All (Cols) {collapsible="true"} +`all()`, `allCols()` + +Creates a `ColumnSet` containing all columns from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet`. This is the opposite of [`none()`](ColumnSelectors.md#none) and equivalent to +[`cols()`](ColumnSelectors.md#cols) without filter. +Note, on [column groups](DataColumn.md#columngroup), `all` is named `allCols` instead to avoid confusion. + +##### All (Cols) After, -Before, -From, -Up To {collapsible="true"} +`allAfter(colA)`, `allBefore(colA)`, `allColsFrom(colA)`, `allColsUpTo(colA)` + +Creates a `ColumnSet` containing a subset of columns from the top-level, +specified [column group](DataColumn.md#columngroup), or `ColumnSet`. +The subset includes: +- `all(Cols)Before(colA)`: All columns before the specified column, excluding that column. +- `all(Cols)After(colA)`: All columns after the specified column, excluding that column. +- `all(Cols)From(colA)`: All columns from the specified column, including that column. +- `all(Cols)UpTo(colA)`: All columns up to the specified column, including that column. 
+ +NOTE: The `{}` overloads of these functions in the Plain DSL and on [column groups](DataColumn.md#columngroup) +take a `ColumnSelector` (relative to the receiver). +On `ColumnSets` they take a `ColumnFilter` instead. + +##### Cols at any Depth {collapsible="true"} +`colsAtAnyDepth {}`, `colsAtAnyDepth()` + +Creates a `ColumnSet` containing all columns from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` at any depth if they satisfy the optional given predicate. This means that columns (of all three kinds!) +nested inside [column groups](DataColumn.md#columngroup) are also included. +This function can also be followed by another `ColumnSet` filter-function like `colsOf<>()`, `single()`, +or `valueCols()`. + +**For example:** + +Depth-first search for a column containing the value "Alice": + +`df.select { colsAtAnyDepth().first { "Alice" in it.values() } }` + +The columns at any depth excluding the top-level: + +`df.select { colGroups().colsAtAnyDepth() }` + +All [value-](DataColumn.md#valuecolumn) and [frame columns](DataColumn.md#framecolumn) at any depth: + +`df.select { colsAtAnyDepth { !it.isColumnGroup() } }` + +All value columns at any depth nested under a column group named "myColGroup": + +`df.select { myColGroup.colsAtAnyDepth().valueCols() }` + + +**Converting from deprecated syntax:** + +`dfs { condition }` -> `colsAtAnyDepth { condition }` + +`allDfs(includeGroups = false)` -> `colsAtAnyDepth { includeGroups || !it.isColumnGroup() }` + +`dfsOf<Type> { condition }` -> `colsAtAnyDepth().colsOf<Type> { condition }` + +`cols { condition }.recursively()` -> `colsAtAnyDepth { condition }` + +`first { condition }.rec()` -> `colsAtAnyDepth { condition }.first()` + +`all().recursively()` -> `colsAtAnyDepth()` + +##### Cols in Groups {collapsible="true"} +`colsInGroups {}`, `colsInGroups()` + +Creates a `ColumnSet` containing all columns that are nested in the [column groups](DataColumn.md#columngroup) at +the top-level, specified [column
group](DataColumn.md#columngroup), or `ColumnSet` adhering to an optional predicate. +This is useful if you want to select all columns that are "one level down". + +This function was previously called `children()`. + +**For example:** + +To get the columns inside all [column groups](DataColumn.md#columngroup) in a [dataframe](DataFrame.md), +instead of having to write: + +`df.select { colGroupA.cols() and colGroupB.cols() ... }` + +you can use: + +`df.select { colsInGroups() }` + +or with filter: + +`df.select { colsInGroups { "user" in it.name } }` + +Similarly, you can take the columns inside all [column groups](DataColumn.md#columngroup) in a `ColumnSet`: + +`df.select { colGroups { "my" in it.name }.colsInGroups() }` + +##### Take (Last) (Cols) (While) {collapsible="true"} +`take(5)`, `takeLastCols(2)`, `takeLastWhile {}`, `takeColsWhile {}` + +Creates a `ColumnSet` containing the first / last `n` columns from the top-level, +specified [column group](DataColumn.md#columngroup), or `ColumnSet`, or those that adhere to the given condition. +Note, to avoid ambiguity, `take` is called `takeCols` when called on a [column group](DataColumn.md#columngroup). + +##### Drop (Last) (Cols) (While) {collapsible="true"} +`drop(5)`, `dropLastCols(2)`, `dropLastWhile {}`, `dropColsWhile {}` + +Creates a `ColumnSet` without the first / last `n` columns from the top-level, +specified [column group](DataColumn.md#columngroup), or `ColumnSet`, or those that adhere to the given condition. +Note, to avoid ambiguity, `drop` is called `dropCols` when called on a [column group](DataColumn.md#columngroup). + +##### Select from [Column Group](DataColumn.md#columngroup) {collapsible="true"} +`colGroupA.select {}`, `"colGroupA" {}` + +Creates a `ColumnSet` containing the columns selected by a `ColumnsSelector` relative to the specified +[column group](DataColumn.md#columngroup).
In practice, this means you're opening a new selection DSL scope inside a +[column group](DataColumn.md#columngroup) and selecting columns from there. +The selected columns are referenced individually and "unpacked" from their parent +[column group](DataColumn.md#columngroup). + +**For example:** + +Select `myColGroup.someCol` and all `String` columns from `myColGroup`: + +`df.select { myColGroup.select { someCol and colsOf<String>() } }` + +`df.select { "myGroupCol" { "colA" and expr("newCol") { colB + 1 } } }` + +`df.select { "pathTo"["myGroupCol"].select { "colA" and "colB" } }` + +`df.select { it["myGroupCol"].asColumnGroup()() { "colA" and "colB" } }` + +> Did you know? Because the Columns Selection DSL uses +> [`@DslMarker`](https://kotlinlang.org/docs/type-safe-builders.html#scope-control-dslmarker), outer scope leaking is +> prohibited! This means that you can't reference columns from the outer scope inside the `select {}` block. This +> ensures safety and prevents issues when multiple columns exist with the same name. +> +> `userData.select { age and address.select { `~~`age`~~` } }` + + +##### (All) (Cols) Except {collapsible="true"} +`colSet.except()`, `allExcept {}`, `colGroupA.allColsExcept {}` + +Performs a selection of columns to exclude from the current selection, using a relative `ColumnsSelector`. + +This function is best explained in parts: + +**On Column Sets:** `except {}` + +This function is easiest to explain with a `ColumnSet`. +Let's say we want all `Int` columns apart from `age` and `height`. + +We can do: + +`df.select { colsOf<Int>() except (age and height) }` + +which will 'subtract' the `ColumnSet` created by `age and height` from the `ColumnSet` created by +[`colsOf<Int>()`](ColumnSelectors.md#cols-of). + +This operation can also be used to exclude columns that are originally in [column groups](DataColumn.md#columngroup).
+ +For instance, excluding `userData.age`: + +`df.select { colsAtAnyDepth { "a" in it.name() } except userData.age }` + +Note that the selection of columns to exclude from column sets is always done relative to the outer scope. +Use the [Extension Properties API](extensionPropertiesApi.md) to prevent scoping issues if possible. + +> Special case: If a column that needs to be removed appears multiple times in the `ColumnSet`, +> it is excluded each time it is encountered (including inside [Column Groups](DataColumn.md#columngroup)). +> You could say the receiver `ColumnSet` is [simplified](ColumnSelectors.md#simplify) before the operation is performed: +> +> `cols(a, a, a.b, a.b).except(a.b) == cols(a).except(a.b)` + + +**Directly in the DSL:** `allExcept {}` + +Instead of having to write `all() except { ... }` in the DSL, you can use `allExcept { ... }` to achieve the same result. + +It behaves identically; it is simply a handy shorthand. + +For example: + +`df.select { allExcept { userData.age and height } }` + +**On [Column Groups](DataColumn.md#columngroup):** `allColsExcept {}` + +The variant of this function on [Column Groups](DataColumn.md#columngroup) is a bit different, as it changes the scope +to be relative to the [Column Group](DataColumn.md#columngroup). +This is similar to the [`select`](ColumnSelectors.md#select-from-column-group) function. + +In other words: + +`df.select { myColGroup.allColsExcept { colA and colB } }` + +is shorthand for + +`df.select { myColGroup.select { allExcept { colA and colB } } }` + +or + +`df.select { myColGroup.allCols() except { myColGroup.colA and myColGroup.colB } }` + +Note the name change: similar to [`allCols`](ColumnSelectors.md#all-cols), this makes it clearer that you're selecting +columns inside the group, 'lifting' them out. + +**Experimental: Except on Column Group** + +Selects the current [column group](DataColumn.md#columngroup) itself, except for the specified columns.
+This is different from `allColsExcept` in that it does not 'lift' the columns out of the group, +but instead selects the group itself. + +These all produce the same result: + +`df.select { colGroup exceptNew { col } }` + +`df.select { colGroup }.remove { colGroup.col }` + +`df.select { cols(colGroup) except colGroup.col }` + +> NOTE: This function is experimental and will definitely change in the future. +> It's named `exceptNew` until the deprecated `SingleColumn.except()` overloads are removed. +> Most likely, it'll be renamed to `except` afterward. +> Until then, it requires `@OptIn(ExperimentalExceptCsDsl::class)` to be used. + +##### Column Name Filters {collapsible="true"} +`nameContains()`, `colsNameContains()`, `nameStartsWith()`, `colsNameEndsWith()` + +Creates a `ColumnSet` containing columns from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` that have names that satisfy the given function. These functions accept a `String` as argument, as +well as an optional `ignoreCase` parameter. For the `nameContains` variant, you can also pass a `Regex` as an argument. +Note, on [column groups](DataColumn.md#columngroup), the functions have names starting with `cols` to avoid +ambiguity. + +##### (Cols) Without Nulls {collapsible="true"} +`withoutNulls()`, `colsWithoutNulls()` + +Creates a `ColumnSet` containing columns from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` that have no `null` values. This is a shorthand for `cols { !it.hasNulls() }`. +Note, to avoid ambiguity, `withoutNulls` is called `colsWithoutNulls` when called on a +[column group](DataColumn.md#columngroup). + +##### Distinct {collapsible="true"} +`colSet.distinct()` + +Returns a new `ColumnSet` from the specified `ColumnSet` containing only distinct columns (by path). +This is useful when you've selected the same column multiple times but only want it once. 
+ +This does not cover the case where a column is selected individually and through its enclosing +[column group](DataColumn.md#columngroup). See [`simplify`](ColumnSelectors.md#simplify) for that. + +NOTE: This doesn't solve the `DuplicateColumnNamesException` if you've selected two columns with the same name. +For this, you'll need to [rename](ColumnSelectors.md#rename) one of the columns. + +##### None {collapsible="true"} +`none()` + +Creates an empty `ColumnSet`, essentially selecting no columns at all. +This is the opposite of [`all()`](ColumnSelectors.md#all-cols). + +This function mostly exists for completeness, but can be useful in some very specific cases. + +##### Cols Of {collapsible="true"} +`colsOf<T>()`, `colsOf<T> {}` + +Creates a `ColumnSet` containing columns from the top-level, specified [column group](DataColumn.md#columngroup), +or `ColumnSet` whose type is a subtype of the specified type `T` and that adhere to the optional condition. + +##### Simplify {collapsible="true"} +`colSet.simplify()` + +Returns a new `ColumnSet` from the specified `ColumnSet` in 'simplified' form. +This function simplifies the structure of the `ColumnSet` by removing columns that are already present in +[column groups](DataColumn.md#columngroup), returning only these groups, +plus columns not belonging to any of the groups. + +In other words, this means that if a column in the `ColumnSet` is inside a [column group](DataColumn.md#columngroup) +in the `ColumnSet`, it will not be included in the result. + +It's useful in combination with [`colsAtAnyDepth {}`](ColumnSelectors.md#cols-at-any-depth), as that function can +create a `ColumnSet` containing both a column and the [column group](DataColumn.md#columngroup) it's in. + +In the past, this function was named `top()` and `roots()`, but these names have been deprecated.
+ +**For example:** + +`cols(a, a.b, d.c).simplify() == cols(a, d.c)` + +##### Filter {collapsible="true"} +`colSet.filter {}` + +Returns a new `ColumnSet` from the specified `ColumnSet` containing only columns that satisfy the given condition. +This function behaves the same as [`cols {}` and `[{}]`](ColumnSelectors.md#cols), but only exists on column sets. + +##### And {collapsible="true"} +`colSet and colB` + +Creates a `ColumnSet` containing the columns from both the left and right side of the function. This allows +you to combine selections or simply select multiple columns at once. + +Any combination of [AccessApi](apiLevels.md) can be used on either side of the `and` operator. + +Note, while you can write `col1 and col2 and col3...`, it may be more concise to use +[`cols(col1, col2, col3...)`](ColumnSelectors.md#cols) instead. The only downside is that you can't mix +[Access APIs](apiLevels.md) with that notation. + +##### Rename {collapsible="true"} +`colA named "colB"`, `colA into namedColAccessor` + +Renaming a column in the Columns Selection DSL is done by calling the infix functions +`named` or `into`. +They behave exactly the same, so it's up to contextual preference which one to use. +Any combination of [Access API](apiLevels.md) can be used to specify the column to rename +and which name should be used instead. + +##### Expr (Column Expression) {collapsible="true"} +`expr {}`, `expr("newCol") {}` + +Creates a temporary new column by defining an expression to fill up each row. +You may have come across this name before in the [Add DSL](add.md) or +[`toDataFrame {}` DSL](createDataFrame.md#todataframe). + +It's extremely useful when you want to create a new column based on existing columns for operations like +[`sortBy`](sortBy.md), [`groupBy`](groupBy.md), etc. + +#### Examples + **Select columns by name:**