From e973e30e0838486b248290ab9381b0df39d02f24 Mon Sep 17 00:00:00 2001 From: sathis Date: Tue, 27 Apr 2021 03:07:10 +0530 Subject: [PATCH] Deduplicate README.md (#79) * Deduplicate README.md * Remove CONTRIBUTING.md as it is no longer relevant Co-authored-by: Sathis Kumar --- CONTRIBUTING.md | 77 ----- datafusion/DEVELOPERS.md => DEVELOPERS.md | 28 +- README.md | 18 +- datafusion/Cargo.toml | 1 + datafusion/README.md | 358 ---------------------- datafusion/src/lib.rs | 2 +- 6 files changed, 26 insertions(+), 458 deletions(-) delete mode 100644 CONTRIBUTING.md rename datafusion/DEVELOPERS.md => DEVELOPERS.md (69%) delete mode 100644 datafusion/README.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 3e636d9cd2fe..000000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,77 +0,0 @@ - - -# How to contribute to Apache Arrow - -## Did you find a bug? - -The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have -to first create an account on the -[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server -hosts bugs and issues for multiple Apache projects. The JIRA project name -for Arrow is "ARROW". - -To be assigned to an issue, ask an Arrow JIRA admin to go to -[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles), -click "Add users to a role," and add you to the "Contributor" role. Most -committers are authorized to do this; if you're a committer and aren't -able to load that project admin page, have someone else add you to the -necessary role. - -Before you create a new bug entry, we recommend you first -[search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues) -among existing Arrow issues. - -When you create a new JIRA entry, please don't forget to fill the "Component" -field. Arrow has many subcomponents and this helps triaging and filtering -tremendously. 
Also, we conventionally prefix the issue title with the component -name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make -lists more easy to navigate, and we'd be grateful if you did the same. - -## Did you write a patch that fixes a bug or brings an improvement? - -First create a JIRA entry as described above. Then, submit your changes -as a GitHub Pull Request. We'll ask you to prefix the pull request title -with the JIRA issue number and the component name in brackets. -(for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()"). -Respecting this convention makes it easier for us to process the backlog -of submitted Pull Requests. - -### Minor Fixes - -Any functionality change should have a JIRA opened. For minor changes that -affect documentation, you do not need to open up a JIRA. Instead you can -prefix the title of your PR with "MINOR: " if meets the following guidelines: - -* Grammar, usage and spelling fixes that affect no more than 2 files -* Documentation updates affecting no more than 2 files and not more - than 500 words. - -## Do you want to propose a significant new feature or an important refactoring? - -We ask that all discussions about major changes in the codebase happen -publicly on the [arrow-dev mailing-list](https://mail-archives.apache.org/mod_mbox/arrow-dev/). - -## Do you have questions about the source code, the build procedure or the development process? - -You can also ask on the mailing-list, see above. - -## Further information - -Please read our [development documentation](https://arrow.apache.org/docs/developers/contributing.html). 
diff --git a/datafusion/DEVELOPERS.md b/DEVELOPERS.md similarity index 69% rename from datafusion/DEVELOPERS.md rename to DEVELOPERS.md index aa80cb71d3b9..1dc9304651c8 100644 --- a/datafusion/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -35,21 +35,21 @@ DataFusion is written in Rust and it uses a standard rust toolkit: Below is a checklist of what you need to do to add a new scalar function to DataFusion: * Add the actual implementation of the function: - * [here](src/physical_plan/string_expressions.rs) for string functions - * [here](src/physical_plan/math_expressions.rs) for math functions - * [here](src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](src/physical_plan) for other functions -* In [src/physical_plan/functions](src/physical_plan/functions.rs), add: + * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + * create a new module [here](datafusion/src/physical_plan) for other functions +* In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: * a new variant to `BuiltinScalarFunction` * a new entry to `FromStr` with the name of the function as called by SQL * a new line in `return_type` with the expected return type of the function, given an incoming type * a new line in `signature` with the signature of the function (number and types of its arguments) * a new line in `create_physical_expr` mapping the built-in to the implementation * tests to the function. -* In [tests/sql.rs](tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. 
-* In [src/logical_plan/expr](src/logical_plan/expr.rs), add: +* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +* In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: * a new entry of the `unary_scalar_expr!` macro for the new function. -* In [src/logical_plan/mod](src/logical_plan/mod.rs), add: +* In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: * a new entry in the `pub use expr::{}` set. ## How to add a new aggregate function @@ -57,18 +57,18 @@ Below is a checklist of what you need to do to add a new scalar function to Data Below is a checklist of what you need to do to add a new aggregate function to DataFusion: * Add the actual implementation of an `Accumulator` and `AggregateExpr`: - * [here](src/physical_plan/string_expressions.rs) for string functions - * [here](src/physical_plan/math_expressions.rs) for math functions - * [here](src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](src/physical_plan) for other functions -* In [src/physical_plan/aggregates](src/physical_plan/aggregates.rs), add: + * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + * create a new module [here](datafusion/src/physical_plan) for other functions +* In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: * a new variant to `BuiltinAggregateFunction` * a new entry to `FromStr` with the name of the function as called by SQL * a new line in `return_type` with the expected return type of the function, given an incoming type * a new line in `signature` with the signature of the function (number and types of its arguments) * a new line in `create_aggregate_expr` mapping 
the built-in to the implementation * tests to the function. -* In [tests/sql.rs](tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. ## How to display plans graphically diff --git a/README.md b/README.md index 9e6b7a2a78b5..f6ef7d176686 100644 --- a/README.md +++ b/README.md @@ -97,8 +97,8 @@ async fn main() -> datafusion::error::Result<()> { let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(100)?; + .aggregate(vec![col("a")], vec![min(col("b"))])? + .limit(100)?; // execute and print results let results: Vec<RecordBatch> = df.collect().await?; @@ -141,11 +141,11 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] SQL Parser - [x] SQL Query Planner - [x] Query Optimizer - - [x] Constant folding - - [x] Join Reordering - - [x] Limit Pushdown - - [x] Projection push down - - [x] Predicate push down +- [x] Constant folding +- [x] Join Reordering +- [x] Limit Pushdown +- [x] Projection push down +- [x] Predicate push down - [x] Type coercion - [x] Parallel query execution @@ -213,7 +213,9 @@ DataFusion also includes a simple command-line interactive SQL utility. 
See the - [ ] MINUS - [x] Joins - [x] INNER JOIN - - [ ] CROSS JOIN + - [x] LEFT JOIN + - [x] RIGHT JOIN + - [x] CROSS JOIN - [ ] OUTER JOIN - [ ] Window diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index eaa7031794cf..3a7e857fe551 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -21,6 +21,7 @@ description = "DataFusion is an in-memory query engine that uses Apache Arrow as version = "4.0.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" +readme = "../README.md" authors = ["Apache Arrow <dev@arrow.apache.org>"] license = "Apache-2.0" keywords = [ "arrow", "query", "sql" ] diff --git a/datafusion/README.md b/datafusion/README.md deleted file mode 100644 index ff0b26d7bf03..000000000000 --- a/datafusion/README.md +++ /dev/null @@ -1,358 +0,0 @@ - - -# DataFusion - - - -DataFusion is an extensible query execution framework, written in -Rust, that uses [Apache Arrow](https://arrow.apache.org) as its -in-memory format. - -DataFusion supports both an SQL and a DataFrame API for building -logical query plans as well as a query optimizer and execution engine -capable of parallel execution against partitioned data sources (CSV -and Parquet) using threads. - -## Use Cases - -DataFusion is used to create modern, fast and efficient data -pipelines, ETL processes, and database systems, which need the -performance of Rust and Apache Arrow and want to provide their users -the convenience of an SQL interface or a DataFrame API. - -## Why DataFusion? 
- -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. - -## Known Uses - -Here are some of the projects known to use DataFusion: - -* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform -* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube.js](https://github.com/cube-js/cube.js) -* [datafusion-python](https://pypi.org/project/datafusion) -* [delta-rs](https://github.com/delta-io/delta-rs) -* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database -* [ROAPI](https://github.com/roapi/roapi) - -(if you know of another project, please submit a PR to add a link!) 
- -## Example Usage - -Run a SQL query against data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let mut ctx = ExecutionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; - - // execute and print results - let results: Vec<RecordBatch> = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let mut ctx = ExecutionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; - - let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(100)?; - - // execute and print results - let results: Vec<RecordBatch> = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Both of these examples will produce - -```text -+---+--------+ -| a | MIN(b) | -+---+--------+ -| 1 | 2 | -+---+--------+ -``` - - - -## Using DataFusion as a library - -DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). - -To get started, add the following to your `Cargo.toml` file: - -```toml -[dependencies] -datafusion = "4.0.0-SNAPSHOT" -``` - -## Using DataFusion as a binary - -DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information. 
- -# Status - -## General - -- [x] SQL Parser -- [x] SQL Query Planner -- [x] Query Optimizer - - [x] Constant folding - - [x] Join Reordering - - [x] Limit Pushdown - - [x] Projection push down - - [x] Predicate push down -- [x] Type coercion -- [x] Parallel query execution - -## SQL Support - -- [x] Projection -- [x] Filter (WHERE) -- [x] Filter post-aggregate (HAVING) -- [x] Limit -- [x] Aggregate -- [x] Common math functions -- [x] cast -- [x] try_cast -- Postgres compatible String functions - - [x] ascii - - [x] bit_length - - [x] btrim - - [x] char_length - - [x] character_length - - [x] chr - - [x] concat - - [x] concat_ws - - [x] initcap - - [x] left - - [x] length - - [x] lpad - - [x] ltrim - - [x] octet_length - - [x] regexp_replace - - [x] repeat - - [x] replace - - [x] reverse - - [x] right - - [x] rpad - - [x] rtrim - - [x] split_part - - [x] starts_with - - [x] strpos - - [x] substr - - [x] to_hex - - [x] translate - - [x] trim -- Miscellaneous/Boolean functions - - [x] nullif -- Common date/time functions - - [ ] Basic date functions - - [ ] Basic time functions - - [x] Basic timestamp functions -- nested functions - - [x] Array of columns -- [x] Schema Queries - - [x] SHOW TABLES - - [x] SHOW COLUMNS - - [x] information_schema.{tables, columns} - - [ ] information_schema other views -- [x] Sorting -- [ ] Nested types -- [ ] Lists -- [x] Subqueries -- [x] Common table expressions -- [ ] Set Operations - - [x] UNION ALL - - [ ] UNION - - [ ] INTERSECT - - [ ] MINUS -- [x] Joins - - [x] INNER JOIN - - [x] LEFT JOIN - - [x] RIGHT JOIN - - [x] CROSS JOIN - - [ ] OUTER JOIN -- [ ] Window - -## Data Sources - -- [x] CSV -- [x] Parquet primitive types -- [ ] Parquet nested types - - -## Extensibility - -DataFusion is designed to be extensible at all points. 
To that end, you can provide your own custom: - -- [x] User Defined Functions (UDFs) -- [x] User Defined Aggregate Functions (UDAFs) -- [x] User Defined Table Source (`TableProvider`) for tables -- [x] User Defined `Optimizer` passes (plan rewrites) -- [x] User Defined `LogicalPlan` nodes -- [x] User Defined `ExecutionPlan` nodes - - -# Supported SQL - -This library currently supports many SQL constructs, including - -* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -* `SELECT ... FROM ...` together with any expression -* `ALIAS` to name an expression -* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -* `WHERE` to filter -* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` -* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - - -## Supported Functions - -DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. - -Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations. - -## Schema Metadata / Information Schema Support - -DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. - -More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). 
- - -To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: - -```sql -> show tables; -+---------------+--------------------+------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | tables | VIEW | -+---------------+--------------------+------------+------------+ - -> select * from information_schema.tables; - -+---------------+--------------------+------------+--------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+--------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | TABLES | SYSTEM TABLE | -+---------------+--------------------+------------+--------------+ -``` - -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: - -```sql -> show columns from t; -+---------------+--------------+------------+-------------+-----------+-------------+ -| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | -+---------------+--------------+------------+-------------+-----------+-------------+ -| datafusion | public | t | a | Int32 | NO | -| datafusion | public | t | b | Utf8 | NO | -| datafusion | public | t | c | Float32 | NO | -+---------------+--------------+------------+-------------+-----------+-------------+ - -> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; -+------------+-------------+------------------+-------------+-----------+ -| table_name | column_name | ordinal_position | is_nullable | data_type | -+------------+-------------+------------------+-------------+-----------+ -| t | a | 0 | NO | Int32 | -| t | b | 1 | NO | Utf8 | -| t | c | 2 | NO | Float32 | 
-+------------+-------------+------------------+-------------+-----------+ -``` - - - -## Supported Data Types - -DataFusion uses Arrow, and thus the Arrow type system, for query -execution. The SQL types from -[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) -are mapped to Arrow types according to the following table - - -| SQL Data Type | Arrow DataType | -| --------------- | -------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | *Not yet supported* | -| `CLOB` | *Not yet supported* | -| `BINARY` | *Not yet supported* | -| `VARBINARY` | *Not yet supported* | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float64` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Date64` | -| `INTERVAL` | *Not yet supported* | -| `REGCLASS` | *Not yet supported* | -| `TEXT` | *Not yet supported* | -| `BYTEA` | *Not yet supported* | -| `CUSTOM` | *Not yet supported* | -| `ARRAY` | *Not yet supported* | - - -# Architecture Overview - -There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. 
- -* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) -* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) - - -# Developer's guide - -Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion. diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 252d168114ad..e1d7368469b0 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -211,4 +211,4 @@ pub mod test; extern crate lazy_static; #[cfg(doctest)] -doc_comment::doctest!("../README.md", readme_example_test); +doc_comment::doctest!("../../README.md", readme_example_test);