Merge pull request #216 from Qrlew/impl_distinct_in_select

Impl distinct in select
Qrlew · Dec 8, 2023 · 93da6f0 · 93da6f0
2 parents bf3ea48 + ca42888
commit 93da6f0
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- implemented `DISTINCT` in the `SELECT` clause [#216](https://github.com/Qrlew/qrlew/issues/216)
+
 ## [0.5.4] - 2023-12-05
 - implemented `STD` and `VAR`aggregations in the dp rewritting [#205](https://github.com/Qrlew/qrlew/issues/205)
 - `Expr::filter_by_function`: if the filtered datatype cannot be determined, keep the original data [#209](https://github.com/Qrlew/qrlew/issues/209)

diff --git a/src/relation/mod.rs b/src/relation/mod.rs
@@ -500,7 +500,10 @@ impl Reduce {
                         aggregate_column
                             .super_image(&input_columns_data_type)
                             .unwrap(),
-                        if has_one_group && aggregate_column.aggregate() == &Aggregate::First {
+                        if aggregate_column.aggregate() == &Aggregate::First && (
+                            has_one_group ||
+                            input.schema().field(aggregate_column.column_name().unwrap()).unwrap().constraint() == Some(Constraint::Unique)
+                        ){
                             Some(Constraint::Unique)
                         } else {
                             None

diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs
@@ -663,6 +663,20 @@ impl Relation {
         }
     }
 
+    /// GROUP BY all the fields. This mimics the SQL `DISTINCT` in the
+    /// `SELECT` clause.
+    pub fn distinct(self) -> Relation {
+        let fields = self.schema()
+            .iter()
+            .map(|f| f.name().to_string())
+            .collect::<Vec<_>>();
+        Relation::reduce()
+            .input(self)
+            .with_iter(fields.iter().map(|f| (f, Expr::first(Expr::col(f)))))
+            .group_by_iter(fields.iter().map(|f| Expr::col(f)))
+            .build()
+    }
+
     /// Build a relation whose output fields are to the aggregations in `aggregates`
     /// applied on the UNIQUE values of the column `column` and grouped by the columns in `group_by`.
     /// If `grouping_by` is not empty, we order by the grouping expressions.
@@ -1924,4 +1938,55 @@ mod tests {
             names_aggs
         );
     }
+
+    #[test]
+    fn test_distinct() {
+        let table: Relation = Relation::table()
+            .name("table")
+            .schema(
+                Schema::builder()
+                    .with(("a", DataType::integer_range(1..=10)))
+                    .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
+                    .with(("c", DataType::integer_range(5..=20)))
+                    .build(),
+            )
+            .build();
+
+        // Table
+        let distinct_relation = table.clone().distinct();
+        assert_eq!(distinct_relation.schema(), table.schema());
+        assert!(matches!(distinct_relation, Relation::Reduce(_)));
+        if let Relation::Reduce(red) = distinct_relation {
+            assert_eq!(red.group_by.len(), table.schema().len())
+        }
+
+        // Map
+        let relation: Relation = Relation::map()
+            .input(table.clone())
+            .with(expr!(a * b))
+            .with(("my_c", expr!(c)))
+            .build();
+        let distinct_relation = relation.clone().distinct();
+        assert_eq!(distinct_relation.schema(), relation.schema());
+        assert!(matches!(distinct_relation, Relation::Reduce(_)));
+        if let Relation::Reduce(red) = distinct_relation {
+            assert_eq!(red.group_by.len(), relation.schema().len())
+        }
+
+        // Reduce
+        let relation: Relation = Relation::reduce()
+            .input(table.clone())
+            .with(expr!(count(a)))
+            //.with_group_by_column("c")
+            .with(("twice_c", expr!(first(2*c))))
+            .group_by(expr!(c))
+            .build();
+        let distinct_relation = relation.clone().distinct();
+        distinct_relation.display_dot();
+        assert_eq!(distinct_relation.schema(), relation.schema());
+        assert!(matches!(distinct_relation, Relation::Reduce(_)));
+        if let Relation::Reduce(red) = distinct_relation {
+            assert_eq!(red.group_by.len(), relation.schema().len())
+        }
+    }
 }
diff --git a/src/sql/relation.rs b/src/sql/relation.rs
@@ -276,6 +276,7 @@ impl<'a> VisitedQueryRelations<'a> {
         group_by: &'a ast::GroupByExpr,
         from: Arc<Relation>,
         having: &'a Option<ast::Expr>,
+        distinct: &'a Option<ast::Distinct>,
     ) -> Result<Arc<Relation>> {
         // Collect all expressions with their aliases
         let mut named_exprs: Vec<(String, Expr)> = vec![];
@@ -372,6 +373,12 @@ impl<'a> VisitedQueryRelations<'a> {
                 .input(relation)
                 .build();
         }
+        if let Some(distinct) = distinct {
+            if matches!(distinct, ast::Distinct::On(_)) {
+                return Err(Error::other("DISTINCT IN is not supported"));
+            }
+            relation = relation.distinct()
+        }
         Ok(Arc::new(relation))
     }
 
@@ -393,9 +400,6 @@ impl<'a> VisitedQueryRelations<'a> {
             named_window,
             qualify,
         } = select;
-        if distinct.is_some() {
-            return Err(Error::other("DISTINCT is not supported"));
-        }
         if top.is_some() {
             return Err(Error::other("TOP is not supported"));
         }
@@ -428,6 +432,7 @@ impl<'a> VisitedQueryRelations<'a> {
             group_by,
             from,
             having,
+            distinct
         )?;
         Ok(RelationWithColumns::new(relation, columns))
     }
@@ -1111,13 +1116,13 @@ mod tests {
             &Hierarchy::from([(["schema", "table_1"], Arc::new(table_1))]),
         ))
         .unwrap();
+        relation.display_dot().unwrap();
         println!("relation = {relation}");
         assert_eq!(
             relation.data_type(),
             DataType::structured(vec![("my_sum", DataType::float_interval(0., 1000.))])
         );
 
-        //relation.display_dot().unwrap();
         let q = ast::Query::from(&relation);
         println!("query = {q}");
 
@@ -1200,4 +1205,34 @@ mod tests {
             DataType::structured(vec![("my_sum", DataType::float().try_empty().unwrap())])
         );
     }
+
+    #[test]
+    fn test_distinct_in_select() {
+        let query = parse("SELECT DISTINCT a, b FROM table_1;").unwrap();
+        let schema_1: Schema = vec![
+            ("a", DataType::integer_interval(0, 10)),
+            ("b", DataType::float_interval(0., 10.)),
+        ]
+        .into_iter()
+        .collect();
+        let table_1 = Relation::table()
+            .name("table_1")
+            .schema(schema_1.clone())
+            .size(100)
+            .build();
+        let relation = Relation::try_from(QueryWithRelations::new(
+            &query,
+            &Hierarchy::from([(["schema", "table_1"], Arc::new(table_1))]),
+        ))
+        .unwrap();
+        relation.display_dot().unwrap();
+        println!("relation = {relation}");
+        assert_eq!(
+            relation.data_type(),
+            DataType::structured(vec![
+                ("a", DataType::integer_interval(0, 10)),
+                ("b", DataType::float_interval(0., 10.)),
+            ])
+        );
+    }
 }
diff --git a/tests/integration.rs b/tests/integration.rs
@@ -93,6 +93,10 @@ const QUERIES: &[&str] = &[
     // Some string functions
     "SELECT UPPER(z) FROM table_2 LIMIT 5",
     "SELECT LOWER(z) FROM table_2 LIMIT 5",
+    // distinct
+    "SELECT DISTINCT COUNT(*) FROM table_1 GROUP BY d",
+    "SELECT DISTINCt c, d FROM table_1",
+    "SELECT c, COUNT(DISTINCT d) AS count_d, SUM(DISTINCT d) AS sum_d FROM table_1 GROUP BY c ORDER BY c"
 ];
 
 #[cfg(feature = "sqlite")]