From 5c335f0b1cbc56eb62b297a807a5acd649c66730 Mon Sep 17 00:00:00 2001 From: Sean Garborg Date: Mon, 24 Nov 2014 17:52:11 -0700 Subject: [PATCH 1/4] Deduplicate docs, move olds specs and prototypes to prototypes branch Closes #723 --- doc/other/03_design_details.md | 708 ----------------- doc/other/04_specification.md | 183 ----- doc/other/05_function_reference_guide.md | 469 ----------- doc/sections/00_table_of_contents.md | 11 - doc/sections/01_introduction.md | 101 --- doc/sections/02_getting_started.md | 134 ---- doc/sections/03_io.md | 96 --- doc/sections/04_subsets.md | 154 ---- doc/sections/05_joins_and_indexing.md | 54 -- doc/sections/06_split_apply_combine.md | 29 - doc/sections/07_reshaping_and_pivoting.md | 9 - doc/sections/08_sorting.md | 40 - doc/sections/10_formulas.md | 38 - doc/sections/11_pooling.md | 50 -- prototypes/benchmark_datastream.jl | 21 - prototypes/dataframe_blocks.jl | 627 --------------- prototypes/datastream.jl | 495 ------------ prototypes/doc/09_datastreams.md | 48 -- prototypes/indexing.jl | 344 -------- prototypes/namedarray.jl | 62 -- prototypes/test_dataframe_blocks.jl | 36 - prototypes/test_datastream.jl | 118 --- prototypes/test_indexing.jl | 39 - spec/FunctionReference.md | 465 ----------- spec/JuliaChanges.md | 43 - spec/basics.md | 10 - spec/show.md | 178 ----- sphinxdoc/other/design_details.rst | 781 ------------------- sphinxdoc/other/function_reference_guide.rst | 589 -------------- sphinxdoc/other/specification.rst | 214 ----- sphinxdoc/source/subsets.rst | 82 +- 31 files changed, 41 insertions(+), 6187 deletions(-) delete mode 100644 doc/other/03_design_details.md delete mode 100644 doc/other/04_specification.md delete mode 100644 doc/other/05_function_reference_guide.md delete mode 100644 doc/sections/00_table_of_contents.md delete mode 100644 doc/sections/01_introduction.md delete mode 100644 doc/sections/02_getting_started.md delete mode 100644 doc/sections/03_io.md delete mode 100644 doc/sections/04_subsets.md 
delete mode 100644 doc/sections/05_joins_and_indexing.md delete mode 100644 doc/sections/06_split_apply_combine.md delete mode 100644 doc/sections/07_reshaping_and_pivoting.md delete mode 100644 doc/sections/08_sorting.md delete mode 100644 doc/sections/10_formulas.md delete mode 100644 doc/sections/11_pooling.md delete mode 100644 prototypes/benchmark_datastream.jl delete mode 100644 prototypes/dataframe_blocks.jl delete mode 100644 prototypes/datastream.jl delete mode 100644 prototypes/doc/09_datastreams.md delete mode 100644 prototypes/indexing.jl delete mode 100644 prototypes/namedarray.jl delete mode 100644 prototypes/test_dataframe_blocks.jl delete mode 100644 prototypes/test_datastream.jl delete mode 100644 prototypes/test_indexing.jl delete mode 100644 spec/FunctionReference.md delete mode 100644 spec/JuliaChanges.md delete mode 100644 spec/basics.md delete mode 100644 spec/show.md delete mode 100644 sphinxdoc/other/design_details.rst delete mode 100644 sphinxdoc/other/function_reference_guide.rst delete mode 100644 sphinxdoc/other/specification.rst diff --git a/doc/other/03_design_details.md b/doc/other/03_design_details.md deleted file mode 100644 index 5652298749..0000000000 --- a/doc/other/03_design_details.md +++ /dev/null @@ -1,708 +0,0 @@ -# The Design of DataFrames - -## The Type Hierarchy - -Before we do anything else, let's go through the hierarchy of types introduced by the DataFrames package. This type hierarchy is depicted visually in the figures at the end of this section and can be summarized in a simple nested list: - -* NAtype -* AbstractDataVector - * DataVector - * PooledDataVector -* AbstractMatrix - * DataMatrix -* AbstractDataArray - * DataArray -* AbstractDataFrame - * DataFrame -* AbstractDataStream - * FileDataStream - * DataFrameDataStream - * MatrixDataStream - -We'll step through each element of this hierarchy in turn in the following sections. 
- -![Scalar and Array Types](figures/types1.png) - -![Tabular Data Types](figures/types2.png) - -## Overview of Basic Types for Working with Data - -There are four new types introduced by the current generation of the DataFrames package: - -* NAtype: A scalar value that represents a single missing piece of data. This value behaves much like `NA` in R. -* DataVector: A vector that can contain values of a specific type as well as `NA` values. -* PooledDataVector: An alternative to DataVector's that can be more memory-efficient if a small number of distinct values are present in the underlying vector of data. -* DataFrame: A tabular data structure that is similar to R's `data.frame` and Pandas' `DataFrame`. - -In the future, we will also be introducing generic Arrays of arbitrary dimension. After this, we will provide two new types: - -* DataMatrix: A matrix that can contain values of a specific type as well as `NA` values. -* DataArray: An array that can contain values of a specific type as well as `NA` values. - -# The NA Type - -The core problem with using the data structures built into Julia for data analysis is that there is no mechanism for expressing the absence of data. Traditional database systems express the absence of data using a `NULL` value, while data analysis packages typically follow the tradition set by S and use `NA` for this purpose when referring to data. (NB: _In S and R, `NULL` is present in addition to `NA`, but it refers to the absence of any specific value for a variable in code, rather than the absence of any specific value for something inside of a data set._) - -The DataFrames package expresses the absence of data by introducing a new type called `NAtype`. This value is used everywhere to indicate missingness in the underlying data set. - -To see this value, you can type - - NAtype - -in the Julia REPL. 
You can learn more about the nature of this new type using standard Julia functions for navigating Julia's type system: - - typeof(NAtype) - - super(NAtype) - - dump(NAtype) - -While the `NAtype` provides the essential type needed to express missingness, the practical way that missing data is denoted uses a special constant `NA`, which is an instance of `NAtype`: - - NA - NAtype() - -You can explore this value to confirm that `NA` is just an instance of the `NAtype`: - - typeof(NA) - - dump(NA) - -Simply being able to express the notion that a data point is missing is important, but we're ultimately not interested in just expressing data: we want to build tools for interacting with data that may be missing. In a later section, we'll describe the details of interacting with `NA`, but for now we'll state the defining property of `NA`: _because `NA` expresses ignorance about the value of something, every interaction with `NA` corrupts known values and transforms them into `NA` values_. Below we show how this works for addition: - - 1 + NA - -We'll discuss the subtleties of `NA` values ability to corrupt known values in a later section. For now the essential point is this: `NA` values exist to represent missingness that occurs in scalar data. - -# The DataVector Type - -To express the notion that a complex data structure like an `Array` contains missing entries, we need to construct a new data structure that can contain standard Julia values like `Float64` while also allowing the presence of `NA` values. - -Of course, a Julian `Array{Any}` would allow us to do this: - - {1, NA} - -But consistently using `Any` arrays would make Julia much less efficient. Instead, we want to provide a new data structure that parallels a standard Julia `Array`, while allowing exactly one additional value: `NA`. - -This new data structure is the `DataVector` type. 
You can construct your first `DataVector` using the following code: - - DataVector[1, NA, 3] - -As you'll see when entering this into the REPL, this snippet of code creates a `3-element DataVector{Int}`. A `DataVector` of type `DataVector{Int}` can store `Int` values or `NA` values. In general, a `DataVector` of type `DataVector{T}` can store values of type `T` or `NA` values. - -This is achieved by a very simple mechanism: a `DataVector{T}` is a new parametric composite type that we've added to Julia that wraps around a standard Julia `Vector` and complements this basic vector with a metadata store that indicates whether any entry of the wrapped vector is missing. In essence, a `DataVector` of type `T` is defined as: - - type DataVector{T} - data::Vector{T} - na::BitVector - end - -This allows us to assess whether any entry of the vector is `NA` at the cost of exactly one additional bit per item. We are able to save space by using `BitArray` instead of an `Array{Bool}`. At present, we store the non-missing data values in a vector called `data` and we store the metadata that indicates which values are missing in a vector called `na`. But end-users should not worry about these implementation details. - -Instead, you can simply focus on the behavior of the `DataVector` type. Let's start off by exploring the basic properties of this new type: - - DataVector - - typeof(DataVector) - typeof(DataVector{Int}) - - super(DataVector) - super(super(DataVector)) - - DataVector.names - -If you want to drill down further, you can always run `dump()`: - - dump(DataVector) - -We're quite proud that the definition of `DataVector` is so simple: it makes it easier for end-users to start contributing code to the DataFrames package. - -# Constructing DataVector's - -Let's focus on ways that you can create new `DataVector`. 
The simplest possible constructor requires the end-user to directly specify both the underlying data values and the missingness metadata as a `BitVector`: - - dv = DataArray([1, 2, 3], falses(3)) - -This is rather ugly, so we've defined many additional constructors that make it easier to create a new `DataVector`. The first simplification is to ignore the distinction between a `BitVector` and an `Array{Bool, 1}` by allowing users to specify `Bool` values directly: - - dv = DataArray([1, 2, 3], [false, false, false]) - -In practice, this is still a lot of useless typing when all of the values of the new `DataVector` are not missing. In that case, you can just pass a Julian vector: - - dv = DataArray([1, 2, 3]) - -When the values you wish to store in a `DataVector` are sequential, you can cut down even further on typing by using a Julian `Range`: - - dv = DataArray(1:3) - -In contrast to these normal-looking constructors, when some of the values in the new `DataVector` are missing, there is a very special type of constructor you can use: - - dv = DataVector[1, 2, NA, 4] - -_Technical Note: This special type of constructor is defined by overloading the `getindex()` function to apply to values of type `DataVector`._ - -# DataVector's with Special Types - -One of the virtues of using metadata to represent missingness instead of sentinel values like `NaN` is that we can easily define `DataVector` over arbitrary types. For example, we can create `DataVector` that store arbitrary Julia types like `ComplexPair` and `Bool`: - - dv = DataArray([1 + 2im, 3 - 1im]) - - dv = DataArray([true, false]) - -In fact, we can add a new type of our own and then wrap it inside of a new sort of `DataVector`: - - type MyNewType - a::Int - b::Int - c::Int - end - - dv = DataArray([MyNewType(1, 2, 3), MyNewType(2, 3, 4)]) - -Of course, specializing the types of `DataVector` means that we sometimes need to convert between types. 
Just as Julia has several specialized conversion functions for doing this, the DataFrames package provides conversion functions as well. For now, we have three such functions: - -* `dataint()` -* `datafloat()` -* `databool()` - -Using these, we can naturally convert between types: - - dv = DataArray([1.0, 2.0]) - - dataint(dv) - -In the opposite direction, we sometimes want to create arbitrary length `DataVector` that have a specific type before we insert values: - - dv = DataArray(Float64, 5) - - dv[1] = 1 - -`DataArray` created in this way have `NA` in all entries. If you instead wish to initialize a `DataArray` with standard initial values, you can use one of several functions: - -* datazeros() -* dataones() -* datafalses() -* datatrues() - -Like the similar functions in Julia's Base, we can specify the length and type of these initialized vectors: - - dv = datazeros(5) - dv = datazeros(Int, 5) - - dv = dataones(5) - dv = dataones(Int, 5) - - dv = datafalses(5) - - dv = datatrues(5) - -# The PooledDataArray Type - -On the surface, `PooledDataArray`s look like `DataArray`s, but their implementation allows the efficient storage and manipulation of `DataVector`s and `DataArrays` which only contain a small number of values. Internally, `PooledDataArray`s hold a pool of unique values, and the actual `DataArray` simply indexes into this pool, rather than storing each value individually. 
- -A `PooledDataArray` can be constructed from an `Array` or `DataArray`, and as with regular `DataArray`s, it can hold `NA` values: - - pda = PooledDataArray([1, 1, 1, 1, 2, 3, 2, 2, 3, 3, 3]) - pda2 = PooledDataArray(DataArray["red", "green", "yellow", "yellow", "red", "orange", "red", "green"]) - -`PooledDataArray`s can also be created empty or with a fixed size and a specific type: - - pda3 = PooledDataArray(String, 2000) # A pooled data array of 2000 strings, intially filled with NAs - pda4 = PooledDataArray(Float64) # An empty pooled data array of floats - -By default, the index into the pool of values is a Uint32, allowing 2^32 possible pool values. If you know that you will only have a much smaller number of unique values, you can specify a smaller reference index type, to save space: - - pda5 = PooledDataArray(String, Uint8, 5000, 2) # Create a 5000x2 array of String values, - # initialized to NA, - # with at most 2^8=256 unique values - -`PooledDataVectors`s can be used as columns in DataFrames. - - -# The DataFrame Type - -While `DataVector` are a very powerful tool for dealing with missing data, they only bring us part of the way towards representing real-world data in Julia. The final missing data structure is a tabular data structure of the sort used in relational databases and spreadsheet software. - -To represent these kinds of tabular data sets, the DataFrames package provides the `DataFrame` type. The `DataFrame` type is a new Julian composite type with just two fields: - -* `columns`: A Julia `Vector{Any}`, each element of which will be a single column of the tabular data. The typical column is of type `DataVector{T}`, but this is not strictly required. -* `colindex`: An `Index` object that allows one to access entries in the columns using both numeric indexing (like a standard Julian `Array`) or key-valued indexing (like a standard Julian `Dict`). 
The details of the `Index` type will be described later; for now, we just note that an `Index` can easily be constructed from any array of `ByteString`. This array is assumed to specify the names of the columns. For example, you might create an index as follows: `Index(["ColumnA", "ColumnB"])`. - -In the future, we hope that there will be many different types of `DataFrame`-like constructs. But all objects that behave like a `DataFrame` will behave according to the following rules that are enforced by an `AbstractDataFrame` protocol: - -* A DataFrame-like object is a table with `M` rows and `N` columns. -* Every column of a DataFrame-like object has its own type. This heterogeneity of types is the reason that a DataFrame cannot simply be represented using a matrix of `DataVector`. -* Each columns of a DataFrame-like object is guaranteed to have length `M`. -* Each columns of a DataFrame-like object is guaranteed to be capable of storing an `NA` value if one is ever inserted. NB: _There is ongoing debate about whether the columns of a DataFrame should always be `DataVector` or whether the columns should only be converted to `DataVector` if an `NA` is introduced by an assignment operation._ - -# Constructing DataFrame's - -Now that you understand what a `DataFrame` is, let's build one: - - df_columns = {datazeros(5), datafalses(5)} - df_colindex = Index(["A", "B"]) - - df = DataFrame(df_columns, df_colindex) - -In practice, many other constructors are more convenient to use than this basic one. The simplest convenience constructors is to provide only the columns, which will produce default names for all the columns. - - df = DataFrame(df_columns) - -One often would like to construct `DataFrame` from columns which may not yet be `DataVector`. This is possible using the same type of constructor. 
All columns that are not yet `DataVector` will be converted to `DataVector`: - - df = DataFrame({ones(5), falses(5)}) - -Often one wishes to convert an existing matrix into a `DataFrame`. This is also possible: - - df = DataFrame(ones(5, 3)) - -Like `DataVector`, it is possible to create empty `DataFrame` in which all of the default values are `NA`. In the simplest version, we specify a type, the number of rows and the number of columns: - - df = DataFrame(Int, 10, 5) - -Alternatively, one can specify a `Vector` of types. This implicitly defines the number of columns, but one must still explicitly specify the number of rows: - - df = DataFrame({Int, Float64}, 4) - -When you know what the names of the columns will be, but not the values, it is possible to specify the column names at the time of construction. - -_SHOULD THIS BE `DataFrame(types, nrow, names)` INSTEAD?_ - - DataFrame({Int, Float64}, ["A", "B"], 10) - DataFrame({Int, Float64}, Index(["A", "B"]), 10) # STILL NEED TO MAKE THIS WORK - -A more uniquely Julian way of creating `DataFrame` exploits Julia's ability to quote `Expression` in order to produce behavior like R's delayed evaluation strategy. - - df = DataFrame(quote - A = rand(5) - B = datatrues(5) - end) - -# Accessing and Assigning Elements of DataVector's and DataFrame's - -Because a `DataVector` is a 1-dimensional Array, indexing into it is trivial and behaves exactly like indexing into a standard Julia vector. - - dv = dataones(5) - dv[1] - dv[5] - dv[end] - dv[1:3] - dv[[true, true, false, false, false]] - - dv[1] = 3 - dv[5] = 5.3 - dv[end] = 2.1 - dv[1:3] = [3.2, 3.2, 3.1] - dv[[true, true, false, false, false]] = dataones(2) # SHOULD WE MAKE THIS WORK? - - -In contrast, a DataFrame is a random-access data structure that can be indexed into and assigned to in many different ways. We walk through many of them below. 
- -## Simple Numeric Indexing - -Index by numbers: - - df = DataFrame(Int, 5, 3) - df[1, 3] - df[1] - - -## Range-Based Numeric Indexing - -Index by ranges: - - df = DataFrame(Int, 5, 3) - - df[1, :] - df[:, 3] - df[1:2, 3] - df[1, 1:3] - df[:, :] - -## Column Name Indexing - -Index by column names: - - df["x1"] - df[1, "x1"] - df[1:3, "x1"] - - df[["x1", "x2"]] - df[1, ["x1", "x2"]] - df[1:3, ["x1", "x2"]] - -# Unary Operators for NA, DataVector's and DataFrame's - -In practice, we want to compute with these new types. The first requirement is to define the basic unary operators: - -* `+` -* `-` -* `!` -* _MISSING: The transpose unary operator_ - -You can see these operators in action below: - - +NA - -NA - !NA - - +dataones(5) - -dataones(5) - !datafalses(5) - -## Binary Operators - -* Arithmetic Operators: - * Scalar Arithmetic: `+`, `-`, `*`, `/`, `^`, - * Array Arithmetic: `+`, `.+`, `-`, `.-`, `.*`, `./`, `.^` -* Bit Operators: `&`, `|`, `$` -* Comparison Operators: - * Scalar Comparisons: `==`, `!=`, `<`, `<=`, `>`, `>=` - * Array Comparisons: `.==`, `.!=`, `.<`, `.<=`, `.>`, `.>=` - -The standard arithmetic operators work on DataVector's when they interact with Number's, NA's or other DataVector's. - - dv = dataones(5) - dv[1] = NA - df = DataFrame(quote - a = 1:5 - end) - -## NA's with NA's - - NA + NA - NA .+ NA - -And so on for `-`, `.-`, `*`, `.*`, `/`, `./`, `^`, `.^`. - -## NA's with Scalars and Scalars with NA's - - 1 + NA - 1 .+ NA - NA + 1 - NA .+ 1 - -And so on for `-`, `.-`, `*`, `.*`, `/`, `./`, `^`, `.^`. - -## NA's with DataVector's - - dv + NA - dv .+ NA - NA + dv - NA .+ dv - -And so on for `-`, `.-`, `*`, `.*`, `/`, `./`, `^`, `.^`. - -## DataVector's with Scalars - - dv + 1 - dv .+ 1 - -And so on for `-`, `.-`, `.*`, `./`, `.^`. - -## Scalars with DataVector's - - 1 + dv - 1 .+ dv - -And so on for `-`, `.-`, `*`, `.*`, `/`, `./`, `^`, `.^`. 
- -_HOW MUCH SHOULD WE HAVE OPERATIONS W/ DATAFRAMES?_ - - NA + df - df + NA - 1 + df - df + 1 - dv + df # SHOULD THIS EXIST? - df + dv # SHOULD THIS EXIST? - df + df - -And so on for `-`, `.-`, `.*`, `./`, `.^`. - -The standard bit operators work on `DataVector`: - -_TO BE FILLED IN_ - -The standard comparison operators work on `DataVector`: - - NA .< NA - NA .< "a" - NA .< 1 - NA .== dv - - dv .< NA - dv .< "a" - dv .< 1 - dv .== dv - - df .< NA - df .< "a" - df .< 1 - df .== dv # SHOULD THIS EXIST? - df .== df - -## Elementwise Functions - -* `abs` -* `sign` -* `acos` -* `acosh` -* `asin` -* `asinh` -* `atan` -* `atan2` -* `atanh` -* `sin` -* `sinh` -* `cos` -* `cosh` -* `tan` -* `tanh` -* `ceil` -* `floor` -* `round` -* `trunc` -* `signif` -* `exp` -* `log` -* `log10` -* `log1p` -* `log2` -* `exponent` -* `sqrt` - -Standard functions that apply to scalar values of type `Number` return `NA` when applied to `NA`: - - abs(NA) - -Standard functions are broadcast to the elements of `DataVector` and `DataFrame` for elementwise application: - - dv = dataones(5) - df = DataFrame({dv}) - - abs(dv) - abs(df) - -## Pairwise Functions - -* `diff` - -Functions that operate on pairs of entries of a `Vector` work on `DataVector` and insert `NA` where it would be produced by other operator rules: - - diff(dv) - -## Cumulative Functions - -* `cumprod` -* `cumsum` -* `cumsum_kbn` -* MISSING: `cummin` -* MISSING: `cummax` - -Functions that operate cumulatively on the entries of a `Vector` work on `DataVector` and insert `NA` where it would be produced by other operator rules: - - cumprod(dv) - cumsum(dv) - cumsum_kbn(dv) - -## Aggregative Functions - -* `minimum` -* `maximum` -* `prod` -* `sum` -* `mean` -* `median` -* `std` -* `var` -* `fft` -* `norm` - -You can see these in action: - - minimum(dv) - -To broadcast these to individual columns, use the `col*s` versions: - -* `colmins` -* `colmaxs` -* `colprods` -* `colsums` -* `colmeans` -* `colmedians` -* `colstds` -* `colvars` 
-* `colffts` -* `colnorms` - -You can see these in action: - - colmins(df) - -# Loading Standard Data Sets - -The DataFrames package is easiest to explore if you also install the RDatasets package, which provides access to 570 classic data sets: - - require("RDatasets") - - iris = RDatasets.data("datasets", "iris") - dia = RDatasets.data("ggplot2", "diamonds") - -# Split-Apply-Combine - -The basic mechanism for splitting data is the `groupby()` function, which will produce a `GroupedDataFrame` object that is easiest to interact with by iterating over its entries: - - for df in groupby(iris, "Species") - println("A DataFrame with $(nrow(df)) rows") - end - -The `|>` (pipe) operator for `GroupedDataFrame` allows you to run simple functions on the columns of the induced `DataFrame`. You pass a simple function by producing a symbol with its name: - - groupby(iris, "Species") |> :mean - -Another simple way to split-and-apply (without clear combining) is to use the `map()` function: - - map(df -> mean(df[1]), groupby(iris, "Species")) - -# Reshaping - -If you are looking for the equivalent of the R "Reshape" package's `melt()` and `cast()` functions, you can use `stack()` and `unstack()`. Note that these functions have exactly the opposite syntax as `melt()` and `cast()`: - - stack(iris, ["Petal.Length", "Petal.Width"]) - -# Model Formulas - -## Design - -Once support for missing data and tabular data structures is in place, we need to begin to develop a version of the model formulas "syntax" used by R. In reality, it is better to regard this "syntax" as a complete domain-specific language (DSL) for describing linear models. For those unfamiliar with this DSL, we show some examples below and then elaborate upon them to demonstrate ways in which Julia might move beyond R's formula system. - -Let's consider the simplest sort of linear regression model: how does the height of a child depend upon the height of the child's mother and father? 
If we let the variable `C` denote the height of the child, `M` the height of the mother and `F` the height of the father, the standard linear model approach in statistics would try to model their relationship using the following equation: `C = a + bM + cF + epsilon`, where `a`, `b` and `c` are fixed constants and `epsilon` is a normally distributed noise term that accounts for the imperfect match between any specific child's height and the predictions based solely on the heights of that child's mother and father. - -In practice, we would fit such a model using a function that performs linear regression for us based on information about the model and the data source. For example, in R we would write `lm(C ~ M + F, data = heights.data)` to fit this model, assuming that `heights.data` refers to a tabular data structure containing the heights of the children, mothers and fathers for which we have data. - -If we wanted to see how the child's height depends only on the mother's height, we would write `lm(C ~ M)`. If we were concerned only about dependence on the father's height, we would write `lm(C ~ F)`. As you can see, we can perform many different statistical analyses using a very concise language for describing those analyses. - -What is that language? The R formula language allows one to specify linear models by specifying the terms that should be included. The language is defined by a very small number of constructs: - -* The `~` operator: The `~` operator separates the pieces of a Formula. For linear models, this means that one specifies the outputs to be predicted on the left-hand side of the `~` and the inputs to be used to make predictions on the right-hand side. -* The `+` operator: If you wish to include multiple predictors in a linear model, you use the `+` operator. To include both the columns `A` and `B` while predicting `C`, you write: `C ~ A + B`. -* The `&` operator: The `&` operator is equivalent to `:` in R. 
It computes interaction terms, which are really an entirely new column created by combining two existing columns. For example, `C ~ A&B` describes a linear model with only one predictor. The values of this predictor at row `i` is exactly `A[i] * B[i]`, where `*` is the standard arithmetic multiplication operation. Because of the precedence rules for Julia, it was not possible to use a `:` operator without writing a custom parser. -* The `*` operator: The `*` operator is really shorthand because `C ~ A*B` expands to `C ~ A + B + A:B`. In other words, in a DSL with only three operators, the `*` is just syntactic sugar. - -In addition to these operators, the model formulas DSL typically allows us to include simple functions of single columns such as in the example, `C ~ A + log(B)`. - -For Julia, this DSL will be handled by constructing an object of type `Formula`. It will be possible to generate a `Formula` using explicitly quoted expression. For example, we might write the Julian equivalent of the models above as `lm(:(C ~ M + F), heights_data)`. A `Formula` object describes how one should convert the columns of a `DataFrame` into a `ModelMatrix`, which fully specifies a linear model. _MORE DETAILS NEEDED ABOUT HOW `ModelMatrix` WORKS._ - -How can Julia move beyond R? The primary improvement Julia can offer over R's model formula approach involves the use of hierarchical indexing of columns to control the inclusion of groups of columns as predictors. For example, a text regression model that uses word counts for thousands of different words as columns in a `DataFrame` might involve writing `IsSpam ~ Pronouns + Prepositions + Verbs` to exclude most words from the analysis except for those included in the `Pronouns`, `Prepositions` and `Verbs` groups. In addition, we might try to improve upon some of the tricks R provides for writing hierarchical models in which each value of a categorical predictor gets its own coefficients. 
This occurs, for example, in hierarchical regression models of the sort implemented by R's `lmer` function. In addition, there are plans to support multiple LHS and RHS components of a `Formula` using a `|` operator. - -## Implementation - -DETAILS NEEDED - -# Factors - -## Design - -As noted above, statistical data often involves variables that are not quantitative, but qualitative. Such variables are typically called categorical variables and can take on only a finite number of different values. For example, a data set about people might contain demographic information such as gender or nationality for which we can know the entire set of possible values in advance. Both gender and nationality are categorical variables and should not be represented using quantitative codes unless required as this is confusing to the user and mathematically suspect since the numbering used is entirely artificial. - -In general, we can require that a `Factor` type allow us to express variables that can take on a known, finite list of values. This finite list is called the levels of a `Factor`. In this sense, a `Factor` is like an enumeration type. - -What makes a `Factor` more specialized than an enumeration type is that modeling tools can interpret factors using indicator variables. This is very important for specifying regression models. For example, if we run a regression in which the right-hand side includes a gender `Factor`, the regression function can replace this factor with two dummy variable columns that encode the levels of this factor. (In practice, there are additional complications because of issues of identifiability or collinearity, but we ignore those for the time being and address them in the Implementation section.) 
- -In addition to the general `Factor` type, we might also introduce a subtype of the `Factor` type that encodes ordinal variables, which are categorical variables that encode a definite ordering such as the values, "very unhappy", "unhappy", "indifferent", "happy" and "very happy". By introducing an `OrdinalFactor` type in which the levels of this sort of ordinal factor are represented in their proper ordering, we can provide specialized functionality like ordinal logistic regression that go beyond what is possible with `Factor` types alone. - -## Implementation - -We have a `Factor` type that handles `NA`s. This type is currently implemented using `PooledDataVector`. - -# DataStreams - -## Specification of DataStream as an Abstract Protocol - -A `DataStream` object allows one to abstractly write code that processes streaming data, which can be used for many things: - -* Analysis of massive data sets that cannot fit in memory -* Online analysis in which interim answers are required while an analysis is still underway - -Before we begin to discuss the use of `DataStream` in Julia, we need to distinguish between streaming data and online analysis: - -* Streaming data involves low memory usage access to a data source. Typically, one demands that a streaming data algorithm use much less memory than would be required to simply represent the full raw data source in main memory. -* Online analysis involves computations on data for which interim answers must be available. For example, given a list of a trillion numbers, one would like to have access to the estimated mean after seeing only the first _N_ elements of this list. Online estimation is essential for building practical statistical systems that will be deployed in the wild. Online analysis is the _sine qua non_ of active learning, in which a statistical system selects which data points it will observe next. 
- -In Julia, a `DataStream` is really an abstract protocol implemented by all subtypes of the abstract type, `AbstractDataStream`. This protocol assumes the following: - -* A `DataStream` provides a connection to an immutable source of data that implements the standard iterator protocol used throughout Julia: - * `start(iter)`: Get initial iteration state. - * `next(iter, state)`: For a given iterable object and iteration state, return the current item and the next iteration state. - * `done(iter, state)`: Test whether we are done iterating. -* Each call to `next()` causes the `DataStream` object to read in a chunk of rows of tabular data from the streaming source and store these in a `DataFrame`. This chunk of data is called a minibatch and its maximum size is specified at the time the DataStream is created. It defaults to _1_ if no size is explicitly specified. -* All rows from the data source must use the same tabular schema. Entries may be missing, but this missingness must be represented explicitly by the `DataStream` using `NA` values. - -Ultimately, we hope to implement a variety of `DataStream` types that wrap access to many different data sources like CSV files and SQL databases. At present, we have only implemented the `FileDataStream` type, which wraps access to a delimited file. In the future, we hope to implement: - -* MatrixDataStream -* DataFrameDataStream -* SQLDataStream -* Other tabular data sources like Fixed Width Files - -Thankfully the abstract `DataStream` protocol allows one to specify algorithms without regard for the specific type of `DataStream` being used. NB: _NoSQL databases are likely to be difficult to support because of their flexible schemas. 
We will need to think about how to interface with such systems in the future._ - -## Constructing DataStreams - -The easiest way to construct a `DataStream` is to specify a filename: - - ds = DataStream("my_data_set.csv") - -You can then iterate over this `DataStream` to see how things work: - - for df in ds - print(ds) - end - -## Use Cases for DataStreams: - -We can compute many useful quantities using `DataStream`: - -* _Means_: `colmeans(ds)` -* _Variances_: `colvars(ds)` -* _Covariances_: `cov(ds)` -* _Correlations_: `cor(ds)` -* _Unique element lists and counts_: _MISSING_ -* _Linear models_: _MISSING_ -* _Entropy_: _MISSING_ - -## Advice on Deploying DataStreams - -* Many useful computations in statistics can be done online: - * Estimation of means, including implicit estimation of means in Reinforcement Learning - * Estimation of entropy - * Estimation of linear regression models -* But many other computations cannot be done online because they require completing a full pass through the data before quantities can be computed exactly. -* Before writing a DataStream algorithm, ask yourself: "what is the performance of this algorithm if I only allow it to make one pass through the data?" - -## References - -* McGregor: Crash Course on Data Stream Algorithms -* Muthukrishnan: Data Streams - Algorithms and Applications -* Chakrabarti: CS85 - Data Stream Algorithms -* Knuth: Art of Computer Programming - -# Ongoing Debates about NA's - -* What are the proper rules for the propagation of missingness? It is clear that there is no simple absolute rule we can follow, but we need to formulate some general principles for how to set reasonable defaults. R's strategy seems to be: - * For operations on vectors, `NA` values are absolutely poisonous by default. - * For operations on `data.frames`, `NA` values are absolutely poisonous on a column-by-column basis by default.
This stems from a more general principle which assumes that most operations on `data.frame` reduce to the aggregation of the same operation performed on each column independently. - * Every function should provide an `na.rm` option that allows one to ignore `NA` values. Essentially this involves replacing `NA` by the identity element for that function: `sum(na.rm = TRUE)` replaces `NA` values with `0`, while `prod(na.rm = TRUE)` replaces `NA` values with `1`. -* Should there be multiple types of missingness? - * For example, SAS distinguishes between: - * Numeric missing values - * Character missing values - * Special numeric missing values - * In statistical theory, while the _fact_ of missingness is simple and does not involve multiple types of `NA` values, the _cause_ of missingness can be different for different data sets, which leads to very different procedures that can appropriately be used. See, for example, the different suggestions in Little and Rubin (2002) about how to treat data that has entries missing completely at random (MCAR) vs. data that has entries missing at random (MAR). Should we be providing tools for handling this? External data sources will almost never provide this information, but multiple dispatch means that Julian statistical functions could ensure that the appropriate computations are performed for properly typed data sets without the end-user ever understanding the process that goes on under the hood. -* How is missingness different from `NaN` for `Float`? Both share poisonous behavior and `NaN` propagation is very efficient in modern computers. This can provide a clever method for making `NA` fast for `Float`, but does not apply to other types and seems potentially problematic as two different concepts are now aliased. For example, we are not uncertain about the value of `0/0` and should not allow any method to impute a value for it -- which any imputation method will do if we treat every `NaN` as equivalent to a `NA`.
-* Should cleverness ever be allowed in propagation of `NA`? In section 3.3.4 of the R Language Definition, they note that in cases where the result of an operation would be the same for all possible values that an `NA` value could take on, the operation may return this constant value rather than return `NA`. For example, `FALSE & NA` returns `FALSE` while `TRUE | NA` returns `TRUE`. This sort of cleverness seems like a can-of-worms. - -## Ongoing Debates about DataFrame's - -* How should RDBMS-like indices be implemented? What is most efficient? How can we avoid the inefficient vector searches that R uses? -* How should `DataFrame` be distributed for parallel processing? diff --git a/doc/other/04_specification.md b/doc/other/04_specification.md deleted file mode 100644 index d49c3fd758..0000000000 --- a/doc/other/04_specification.md +++ /dev/null @@ -1,183 +0,0 @@ -# Formal Specification of DataFrames Data Structures - -* Type Definitions and Type Hierarchy -* Constructors -* Indexing (Refs / Assigns) -* Operators - * Unary Operators: - * `+`, `-`, `!`, `'` - * Elementary Unary Functions - * `abs`, ... 
- * Binary Operators: - * Arithmetic Operators: - * Scalar Arithmetic: `+`, `-`, `*`, `/`, `^`, - * Array Arithmetic: `+`, `.+`, `-`, `.-`, `.*`, `./`, `.^` - * Bit Operators: `&`, `|`, `$` - * Comparison Operators: - * Scalar Comparisons: `==`, `!=`, `<`, `<=`, `>`, `>=` - * Array Comparisons: `.==`, `.!=`, `.<`, `.<=`, `.>`, `.>=` -* Container Operations -* Broadcasting / Recycling -* Type Promotion and Conversion -* String Representations -* IO -* Copying -* Properties - * size - * length - * ndims - * eltype -* Predicates -* Handling NA's -* Iteration -* Miscellaneous - -## The NAtype - -### Behavior under Unary Operators - -The unary operators - -### Behavior under Unary Operators - -The unary operators - -### Behavior under Arithmetic Operators - -# Constructors - -* NA's - * Constructor: `NAtype()` - * Const alias: `NA` -* DataVector's - * From (Vector, BitVector): `DataArray([1, 2, 3], falses(3))` - * From (Vector, Vector{Bool}): `DataArray([1, 2, 3], [false, false, false])` - * From (Vector): `DataArray([1, 2, 3])` - * From (BitVector, BitVector): `DataArray(trues(3), falses(3))` - * From (BitVector): `DataArray(trues(3))` - * From (Range1): `DataArray(1:3)` - * From (DataVector): `DataArray(DataArray([1, 2, 3]))` - * From (Type, Int): `DataArray(Int, 3)` - * From (Int): `DataArray(3)` (Type defaults to Float64) - * From (): `DataArray()` (Type defaults to Float64, length defaults to 0) - * Initialized with Float64 zeros: `datazeros(3)` - * Initialized with typed zeros: `datazeros(Int, 3)` - * Initialized with Float64 ones: `dataones(3)` - * Initialized with typed ones: `dataones(Int, 3)` - * Initialized with falses: `datafalses(3)` - * Initialized with trues: `datatrues(3)` - * Literal syntax: `DataVector[1, 2, NA]` -* PooledDataVector's - * From (Vector, BitVector): `PooledDataArray([1, 2, 3], falses(3))` - * From (Vector, Vector{Bool}): `PooledDataArray([1, 2, 3], [false, false, false])` - * From (Vector): `PooledDataArray([1, 2, 3])` - * From 
(BitVector, BitVector): `PooledDataArray(trues(3), falses(3))` - * From (BitVector, Vector{Bool}): `PooledDataArray(trues(3), [false, false, false])` - * From (BitVector): `PooledDataArray(trues(3))` - * From (Range1): `PooledDataArray(1:3)` - * From (DataVector): `PooledDataArray(DataArray([1, 2, 3]))` - * From (Type, Int): `PooledDataArray(Int, 3)` - * From (Int): `PooledDataArray(3)` (Type defaults to Float64) - * From (): `PooledDataArray()` (Type defaults to Float64, length defaults to 0) - * Initialized with Float64 zeros: `pdatazeros(3)` - * Initialized with typed zeros: `pdatazeros(Int, 3)` - * Initialized with Float64 ones: `pdataones(3)` - * Initialized with typed ones: `pdataones(Int, 3)` - * Initialized with falses: `pdatafalses(3)` - * Initialized with trues: `pdatatrues(3)` - * Literal syntax: `PooledDataVector[1, 2, NA]` -* DataMatrix - * From (Array, BitArray): `DataMatrix([1 2; 3 4], falses(2, 2))` - * From (Array, Array{Bool}): `DataMatrix([1 2; 3 4], [false false; false false])` - * From (Array): `DataMatrix([1 2; 3 4])` - * From (BitArray, BitArray): `DataMatrix(trues(2, 2), falses(2, 2))` - * From (BitArray): `DataMatrix(trues(2, 2))` - * From (DataVector...): `DataMatrix(DataVector[1, NA], DataVector[NA, 2])` - * From (Range1...): `DataMatrix(1:3, 1:3)` - * From (DataMatrix): `DataMatrix(DataArray([1 2; 3 4]))` - * From (Type, Int, Int): `DataMatrix(Int, 2, 2)` - * From (Int, Int): `DataMatrix(2, 2)` (Type defaults to Float64) - * From (): `DataMatrix()` (Type defaults to Float64, length defaults to (0, 0)) - * Initialized with Float64 zeros: `dmzeros(2, 2)` - * Initialized with typed zeros: `dmzeros(Int, 2, 2)` - * Initialized with Float64 ones: `dmones(2, 2)` - * Initialized with typed ones: `dmones(Int, 2, 2)` - * Initialized with falses: `dmfalses(2, 2)` - * Initialized with trues: `dmtrues(2, 2)` - * Initialized identity matrix: `dmeye(2, 2)` - * Initialized identity matrix: `dmeye(2)` - * Initialized diagonal matrix: `dmdiagm([2, 1])` - 
* Literal syntax: `DataMatrix[1 2; NA 2]` -* DataFrame - * From (): `DataFrame()` - * From (Vector{Any}, Index): `DataFrame({datazeros(3), dataones(3)}, Index(["A", "B"]))` - * From (Vector{Any}): `DataFrame({datazeros(3), dataones(3)})` - * From (Expr): `DataFrame(quote A = [1, 2, 3, 4] end)` - * From (Matrix, Vector{String}): `DataFrame([1 2; 3 4], ["A", "B"])` - * From (Matrix): `DataFrame([1 2; 3 4])` - * From (Tuple): `DataFrame(dataones(2), datafalses(2))` - * From (Associative): ??? - * From (Vector, Vector, Groupings): ??? - * From (Dict of Vectors): `DataFrame({"A" => [1, 3], "B" => [2, 4]})` - * From (Dict of Vectors, Vector{String}): `DataFrame({"A" => [1, 3], "B" => [2, 4]}, ["A"])` - * From (Type, Int, Int): `DataFrame(Int, 2, 2)` - * From (Int, Int): `DataFrame(2, 2)` - * From (Vector{Types}, Vector{String}, Int): `DataFrame({Int, Float64}, ["A", "B"], 2)` - * From (Vector{Types}, Int): `DataFrame({Int, Float64}, 2)` - -# Indexing - -Types on indices: - - NA - - dv = datazeros(10) - - dv[1] - - dv[1:2] - - dv[:] - - dv[[1, 2 3]] - - dv[[false, false, true, false, false]] - - dmzeros(10) - -Indexers: Int, Range, Colon, Vector{Int}, Vector{Bool}, String, Vector{String} - -DataVector's and PooledDataVector's implement: - -* Int -* Range -* Colon -* Vector{Int} -* Vector{Bool} - -DataMatrix's implement the Cartesian product: - -* Int, Int -* Int, Range -* Int, Colon -* Int, Vector{Int} -* Int, Vector{Bool} -... -* Vector{Bool}, Int -* Vector{Bool}, Range -* Vector{Bool}, Colon -* Vector{Bool}, Vector{Int} -* Vector{Bool}, Vector{Bool} - -Single Int access? - -DataFrame's add two new indexer types: - -* String -* Vector{String} - -These can only occur as (a) the only indexer or (b) in the second slot of a paired indexer - -Anything that can be getindex()'d can also be setindex!()'d - -Where do we allow Expr indexing? 
diff --git a/doc/other/05_function_reference_guide.md b/doc/other/05_function_reference_guide.md deleted file mode 100644 index 0421566700..0000000000 --- a/doc/other/05_function_reference_guide.md +++ /dev/null @@ -1,469 +0,0 @@ -# Function Reference Guide - -## DataFrames - -#### `DataFrame(cols::Vector, colnames::Vector{ByteString})` - -Construct a DataFrame from the columns given by `cols` with the index -generated by `colnames`. A DataFrame inherits from -`Associative{Any,Any}`, so Associative operations should work. Columns -are vector-like objects. Normally these are AbstractDataVector's (DataVector's -or PooledDataVector's), but they can also (currently) include standard -Julia Vectors. - -#### `DataFrame(cols::Vector)` - -Construct a DataFrame from the columns given by `cols` with default -column names. - -#### `DataFrame()` - -An empty DataFrame. - -#### `copy(df::DataFrame)` - -A shallow copy of `df`. Columns are referenced, not copied. - -#### `deepcopy(df::DataFrame)` - -A deep copy of `df`. Copies of each column are made. - -#### `similar(df::DataFrame, nrow)` - -A new DataFrame with `nrow` rows and the same column names and types as `df`. - - -### Basics - -#### `size(df)`, `ndims(df)` - -Same meanings as for Arrays. - -#### `has(df, key)`, `get(df, key, default)`, `keys(df)`, and `values(df)` - -Same meanings as Associative operations. `keys` are column names; -`values` are column contents. - -#### `start(df)`, `done(df,i)`, and `next(df,i)` - -Methods to iterate over columns. - -#### `ncol(df::AbstractDataFrame)` - -Number of columns in `df`. - -#### `nrow(df::AbstractDataFrame)` - -Number of rows in `df`. - -#### `length(df::AbstractDataFrame)` - -Number of columns in `df`. - -#### `isempty(df::AbstractDataFrame)` - -Whether the number of columns equals zero. - -#### `head(df::AbstractDataFrame)` and `head(df::AbstractDataFrame, i::Int)` - -First `i` rows of `df`. Defaults to 6. 
- -#### `tail(df::AbstractDataFrame)` and `tail(df::AbstractDataFrame, i::Int)` - -Last `i` rows of `df`. Defaults to 6. - -#### `show(io, df::AbstractDataFrame)` - -Standard pretty-printer of `df`. Called by `print()` and the REPL. - -#### `dump(df::AbstractDataFrame)` - -Show the structure of `df`. Like R's `str`. - -#### `describe(df::AbstractDataFrame)` - -Show a description of each column of `df`. - -#### `complete_cases(df::AbstractDataFrame)` - -A Vector{Bool} of indexes of complete cases in `df` (rows with no -NA's). - -#### `duplicated(df::AbstractDataFrame)` - -A Vector{Bool} of indexes indicating rows that are duplicates of prior -rows. - -#### `unique(df::AbstractDataFrame)` - -DataFrame with unique rows in `df`. - - -### Indexing, Assignment, and Concatenation - -DataFrames are indexed like a Matrix and like an Associative. Columns -may be indexed by column name. Rows do not have names. Referencing -with one argument normally indexes by columns: `df["col"]`, -`df[["col1","col3"]]` or `df[i]`. With two arguments, rows and columns -are selected. Indexing along rows works like Matrix indexing. Indexing -along columns works like Matrix indexing with the addition of column -name access. - -#### `getindex(df::DataFrame, ind)` or `df[ind]` - -Returns a subset of the columns of `df` as specified by `ind`, which -may be an `Int`, a `Range`, a `Vector{Int}`, `ByteString`, or -`Vector{ByteString}`. Columns are referenced, not copied. For a -single-element `ind`, the column by itself is returned. - -#### `getindex(df::DataFrame, irow, icol)` or `df[irow,icol]` - -Returns a subset of `df` as specified by `irow` and `icol`. `irow` may -be an `Int`, a `Range`, or a `Vector{Int}`. `icol` may be an `Int`, a -`Range`, a `Vector{Int}`, a `ByteString`, or a -`Vector{ByteString}`. For a single-element `ind`, the column subset by -itself is returned. - -#### `index(df::DataFrame)` - -Returns the column `Index` for `df`.
- -#### `set_group(df::DataFrame, newgroup, names::Vector{ByteString})` -#### `get_groups(df::DataFrame)` -#### `set_groups(df::DataFrame, gr::Dict)` - -See the Indexing section for these operations on column indexes. - -#### `colnames(df::DataFrame)` or `names(df::DataFrame)` - -The column names as an `Array{ByteString}` - -#### `setindex!(df::DataFrame, newcol, colname)` or `df[colname] = newcol` - -Replace or add a new column with name `colname` and contents `newcol`. -Arrays are converted to DataVector's. Values are recycled to match the -number of rows in `df`. - -#### `insert!(df::DataFrame, index::Integer, item, name)` - -Insert a column of name `name` and with contents `item` into `df` at -position `index`. - -#### `insert!(df::DataFrame, df2::DataFrame)` - -Insert columns of `df2` into `df`. - -#### `del!(df::DataFrame, cols)` - -Delete columns in `df` at positions given by `cols` (specified by any -means by which columns can be referenced). - -#### `del(df::DataFrame, cols)` - -Nondestructive version. Return a DataFrame based on the columns in -`df` after deleting columns specified by `cols`. - -#### `deleterows!(df::DataFrame, inds)` - -Delete rows at positions specified by `inds` from the given DataFrame. - -#### `cbind(df1, df2, ...)` or `hcat(df1, df2, ...)` or `[df1 df2 ...]` - -Concatenate columns. Duplicated column names are adjusted. - -#### `rbind(df1, df2, ...)` or `vcat(df1, df2, ...)` or `[df1, df2, ...]` - -Concatenate rows. - -### I/O - -#### `csvDataFrame(filename, o::Options)` - -Return a DataFrame from file `filename`. Options `o` include -`colnames` (`"true"`, `"false"`, or `"check"` (the default)) and -`poolstrings` (`"check"` (default) or `"never"`). - -### Expression/Function Evaluation in a DataFrame - -#### `with(df::AbstractDataFrame, ex::Expr)` - -Evaluate expression `ex` with the columns in `df`. - -#### `within(df::AbstractDataFrame, ex::Expr)` - -Return a copy of `df` after evaluating expression `ex` with the -columns in `df`.
- -#### `within!(df::AbstractDataFrame, ex::Expr)` - -Modify `df` by evaluating expression `ex` with the columns in `df`. - -#### `based_on(df::AbstractDataFrame, ex::Expr)` - -Return a new DataFrame based on evaluating expression `ex` with the -columns in `df`. Often used for summarizing operations. - -#### `colwise(f::Function, df::AbstractDataFrame)` -#### `colwise(f::Vector{Function}, df::AbstractDataFrame)` - -Apply `f` to each column of `df`, and return the results as an -Array{Any}. - -#### `colwise(df::AbstractDataFrame, s::Symbol)` -#### `colwise(df::AbstractDataFrame, s::Vector{Symbol})` - -Apply the function specified by Symbol `s` to each column of `df`, and -return the results as a DataFrame. - -### SubDataFrames - -#### `sub(df::DataFrame, r, c)` -#### `sub(df::DataFrame, r)` - -Return a SubDataFrame with references to rows and columns of `df`. - - -#### `sub(sd::SubDataFrame, r, c)` -#### `sub(sd::SubDataFrame, r)` - -Return a SubDataFrame with references to rows and columns of `df`. - -#### `getindex(sd::SubDataFrame, r, c)` or `sd[r,c]` -#### `getindex(sd::SubDataFrame, c)` or `sd[c]` - -Referencing should work the same as DataFrames. - - -### Grouping - -#### `groupby(df::AbstractDataFrame, cols)` - -Return a GroupedDataFrame based on unique groupings indicated by the -columns with one or more names given in `cols`. - -#### `start(gd)`, `done(gd,i)`, and `next(gd,i)` - -Methods to iterate over GroupedDataFrame groupings. - -#### `getindex(gd::GroupedDataFrame, idx)` or `gd[idx]` - -Reference a particular grouping. Referencing returns a SubDataFrame. - -#### `with(gd::GroupedDataFrame, ex::Expr)` - -Evaluate expression `ex` with the columns in `gd` in each grouping. - -#### `within(gd::GroupedDataFrame, ex::Expr)` -#### `within!(gd::GroupedDataFrame, ex::Expr)` - -Return a DataFrame with the results of evaluating expression `ex` with -the columns in `gd` in each grouping. 
- -#### `based_on(gd::GroupedDataFrame, ex::Expr)` - -Sweeps along groups and applies `based_on` to each group. Returns a -DataFrame. - -#### `map(f::Function, gd::GroupedDataFrame)` - -Apply `f` to each grouping of `gd` and return the results in an Array. - -#### `colwise(f::Function, gd::GroupedDataFrame)` -#### `colwise(f::Vector{Function}, gd::GroupedDataFrame)` - -Apply `f` to each column in each grouping of `gd`, and return the -results as an Array{Any}. - -#### `colwise(gd::GroupedDataFrame, s::Symbol)` -#### `colwise(gd::GroupedDataFrame, s::Vector{Symbol})` - -Apply the function specified by Symbol `s` to each column in each -grouping of `gd`, and return the results as a DataFrame. - -#### `by(df::AbstractDataFrame, cols, s::Symbol)` or `groupby(df, cols) |> s` -#### `by(df::AbstractDataFrame, cols, s::Vector{Symbol})` - -Return a DataFrame with the results of grouping on `cols` and -`colwise` evaluation based on `s`. Equivalent to `colwise(groupby(df, -cols), s)`. - -#### `by(df::AbstractDataFrame, cols, e::Expr)` or `groupby(df, cols) |> e` - -Return a DataFrame with the results of grouping on `cols` and -evaluation of `e` in each grouping. Equivalent to `based_on(groupby(df, -cols), e)`. - -### Reshaping / Merge - -#### `stack(df::DataFrame, cols)` - -For conversion from wide to long format. Returns a DataFrame with -stacked columns indicated by `cols`. The result has column `"key"` -with column names from `df` and column `"value"` with the values from -`df`. Columns in `df` not included in `cols` are duplicated along the -stack. - -#### `unstack(df::DataFrame, ikey, ivalue, irefkey)` - -For conversion from long to wide format. Returns a DataFrame. `ikey` -indicates the key column--unique values in column `ikey` will be -column names in the result. `ivalue` indicates the value column. -`irefkey` is the column with a unique identifier for that row. Columns -not given by `ikey`, `ivalue`, or `irefkey` are currently ignored.
- -#### `merge(df1::DataFrame, df2::DataFrame, bycol)` -#### `merge(df1::DataFrame, df2::DataFrame, bycol, jointype)` - -Return the database join of `df1` and `df2` based on the column `bycol`. -Currently only a single merge key is supported. Supports `jointype` of -"inner" (the default), "left", "right", or "outer". - - -## Index - -#### `Index()` -#### `Index(s::Vector{ByteString})` - -An Index with names `s`. An Index is like an Associative type. An -Index is used for column indexing of DataFrames. An Index maps -ByteStrings and Vector{ByteStrings} to Indices. - -#### `length(x::Index)`, `copy(x::Index)`, `has(x::Index, key)`, `keys(x::Index)`, `push!(x::Index, name)` - -Normal meanings. - -#### `del(x::Index, idx::Integer)`, `del(x::Index, s::ByteString)`, - -Delete the name `s` or name at position `idx` in `x`. - -#### `names(x::Index)` - -A Vector{ByteString} with the names of `x`. - -#### `names!(x::Index, nm::Vector{ByteString})` - -Set names `nm` in `x`. - -#### `rename(x::Index, f::Function)` -#### `rename(x::Index, nd::Associative)` -#### `rename(x::Index, from::Vector, to::Vector)` - -Replace names in `x`, by applying function `f` to each name, -by mapping old to new names with a dictionary (Associative), or using -`from` and `to` vectors. - -#### `getindex(x::Index, idx)` or `x[idx]` - -This does the mapping from name(s) to Indices (positions). `idx` may -be ByteString, Vector{ByteString}, Int, Vector{Int}, Range{Int}, -Vector{Bool}, AbstractDataVector{Bool}, or AbstractDataVector{Int}. - -#### `set_group(idx::Index, newgroup, names::Vector{ByteString})` - -Add a group to `idx` with name `newgroup` that includes the names in -the vector `names`. - -#### `get_groups(idx::Index)` - -A Dict that maps the name of each group to the names in the group. - -#### `set_groups(idx::Index, gr::Dict)` - -Set groups in `idx` based on the mapping given by `gr`. 
- - -## Missing Values - -Missing value behavior is implemented by instantiations of the `AbstractDataVector` -abstract type. - -#### `NA` - -A constant indicating a missing value. - -#### `isna(x)` - -Return a `Bool` or `Array{Bool}` (if `x` is an `AbstractDataVector`) -that is `true` for elements with missing values. - -#### `nafilter(x)` - -Return a copy of `x` after removing missing values. - -#### `nareplace(x, val)` - -Return a copy of `x` after replacing missing values with `val`. - -#### `naFilter(x)` - -Return an object based on `x` such that future operations like `mean` -will not include missing values. This can be an iterator or other -object. - -#### `naReplace(x, val)` - -Return an object based on `x` such that future operations like `mean` -will replace NAs with `val`. - -#### `na(x)` - -Return an `NA` value appropriate for the type of `x`. - -#### `nas(x, dim)` - -Return an object like `x` filled with `NA` values with size `dim`. - - -## DataVector's - -#### `DataArray(x::Vector)` -#### `DataArray(x::Vector, m::Vector{Bool})` - -Create a DataVector from `x`, with `m` optionally indicating which values -are NA. DataVector's are like Julia Vectors with support for NA's. `x` may -be any type of Vector. - -#### `PooledDataArray(x::Vector)` -#### `PooledDataArray(x::Vector, m::Vector{Bool})` - -Create a PooledDataVector from `x`, with `m` optionally indicating which -values are NA. PooledDataVector's contain a pool of values with references -to those values. This is useful in a similar manner to an R array of -factors. - -#### `size`, `length`, `ndims`, `ref`, `assign`, `start`, `next`, `done` - -All normal Vector operations including array referencing should work. - -#### `isna(x)`, `nafilter(x)`, `nareplace(x, val)`, `naFilter(x)`, `naReplace(x, val)` - -All NA-related methods are supported. 
- -## Utilities - -#### `cut(x::Vector, breaks::Vector)` - -Returns a PooledDataVector with length equal to `x` that divides values in `x` -based on the divisions given by `breaks`. - -## Formulas and Models - -#### `Formula(ex::Expr)` - -Return a Formula object based on `ex`. Formulas are two-sided -expressions separated by `~`, like `:(y ~ w*x + z + i&v)`. - -#### `model_frame(f::Formula, d::AbstractDataFrame)` -#### `model_frame(ex::Expr, d::AbstractDataFrame)` - -A ModelFrame. - -#### `model_matrix(mf::ModelFrame)` -#### `model_matrix(f::Formula, d::AbstractDataFrame)` -#### `model_matrix(ex::Expr, d::AbstractDataFrame)` - -A ModelMatrix based on `mf`, `f` and `d`, or `ex` and `d`. - -#### `lm(ex::Expr, df::AbstractDataFrame)` - -Linear model results (type OLSResults) based on formula `ex` and `df`. diff --git a/doc/sections/00_table_of_contents.md b/doc/sections/00_table_of_contents.md deleted file mode 100644 index 1b95485c73..0000000000 --- a/doc/sections/00_table_of_contents.md +++ /dev/null @@ -1,11 +0,0 @@ -# Table of Contents - -* Why Use the DataFrames Package? - * Missing Data Points - * Data Structures for Storing Missing Data Points - * Tabular Data Structures - * A Language for Expressing Statistical Models -* Getting Started -* The Design of DataFrames -* Formal Specification of DataFrames -* Function Reference Guide diff --git a/doc/sections/01_introduction.md b/doc/sections/01_introduction.md deleted file mode 100644 index 0b02411fdf..0000000000 --- a/doc/sections/01_introduction.md +++ /dev/null @@ -1,101 +0,0 @@ -# Why Use the DataFrames Package? - -We believe that Julia is the future of technical computing. Nevertheless, -Base Julia is not sufficient for statistical computing. 
The DataFrames -package (and its sibling, DataArrays) extends Base Julia by introducing three -basic types needed for statistical computing: - -* `NA`: An indicator that a data value is missing -* `DataArray`: An extension to the `Array` type that can contain missing - values -* `DataFrame`: A data structure for representing tabular data sets - -## `NA`: An Indicator for Missing Data Points - -Suppose that we want to calculate the mean of a list of five `Float64` -numbers: `x1`, `x2`, `x3`, `x4` and `x5`. We would normally do this -in Julia as follows: - -* Represent these five numbers as a `Vector`: `v = [x1, x2, x3, x4, x5]` -* Compute the mean of `v` using the `mean()` function - -_But what if one of the five numbers were missing?_ - -The concept of a missing data point cannot be directly expressed in Julia. -In contrast with languages like Java and R, which provide `NULL` and `NA` -values that represent missingness, there is no missing data value in Base -Julia. - -The DataArrays package therefore provides `NA`, which serves as an indicator -that a specific value is missing. In order to exploit Julia's multiple dispatch -rules, `NA` is a singleton object of a new type called `NAtype`. - -Like R's `NA` value and unlike Java's `NULL` value, Julia's `NA` value represents -epistemic uncertainty. This means that operations involving `NA` return `NA` -when the result of the operation cannot be determined, but operations whose -value can be determined despite the presence of `NA` will return a value that -is not `NA`. - -For example, `false && NA` evaluates to `false` and `true || NA` evaluates -to `true`. In contrast, `1 + NA` evaluates to `NA` because the outcome is -uncertain in the absence of knowledge about the missing value represented -by `NA`. - -## `DataArray`: Efficient Arrays with Missing Values - -Although the `NA` value is sufficient for representing missing scalar values, -it cannot be stored efficiently inside of Julia's standard `Array` type. 
To -represent arrays with potentially missing entries, the DataArrays package -introduces a `DataArray` type. For example, a `DataArray{Float64}` can -contain `Float64` values and `NA` values, but nothing else. In contrast, the -most specific `Array` that can contain both `Float64` and `NA` values is an -`Array{Any}`. - -Except for the ability to store `NA` values, the `DataArray` type is meant to -behave exactly like Julia's standard `Array` type. In particular, `DataArray` -provides two typealiases called `DataVector` and `DataMatrix` that mimic the -`Vector` and `Matrix` typealiases for 1D and 2D `Array` types. - -## `DataFrame`: Tabular Data Sets - -`NA` and `DataArray` provide mechanisms for handling missing values for scalar -types and arrays, but most real world data sets have a tabular structure that -does not correspond to a simple `DataArray`. - -For example, the data table shown below highlights some of the ways in which a -typical data set is not like a `DataArray`: - -![Tabular Data](figures/data.png) - -Note three important properties that this table possesses: - -* The columns of a tabular data set may have different types. A `DataMatrix` - can only contain values of one type: these might all be `String` or `Int`, - but we cannot have one column of `String` type and another column of `Int` - type. -* The values of the entries within a column always have a consistent type. - This means that a single column could be represented using a `DataVector`. - Unfortunately, the heterogeneity of types between columns means that we - need some way of wrapping a group of columns together into a coherent whole. - We could use a standard `Vector` to wrap up all of the columns of the table, - but this will not enforce an important constraint imposed by our intuitions: - _every column of a tabular data set has the same length as all of the other - columns_. -* The columns of a tabular data set are typically named using some sort of - `String`. 
Often, one wants to access the entries of a data set by using a - combination of verbal names and numeric indices. - -We can summarize these concerns by noting that we face four problems when -working with tabular data sets: - -* Tabular data sets may have columns of heterogeneous type -* Each column of a tabular data set has a consistent type across all of - its entries -* All of the columns of a tabular data set have the same length -* The columns of a tabular data set should be addressable using both verbal - names and numeric indices - -The DataFrames package solves these problems by adding a `DataFrame` type -to Julia. This type will be familiar to anyone who has worked with R's -`data.frame` type, Pandas' `DataFrame` type, an SQL-style database, or -an Excel spreadsheet. diff --git a/doc/sections/02_getting_started.md b/doc/sections/02_getting_started.md deleted file mode 100644 index 9648fd11c9..0000000000 --- a/doc/sections/02_getting_started.md +++ /dev/null @@ -1,134 +0,0 @@ -# Getting Started - -## Installation - -The DataFrames package is available through the Julia package system. Throughout -the rest of this tutorial, we will assume that you have installed the DataFrames -package and have already typed `using DataArrays, DataFrames` to bring all of -the relevant variables into your current namespace. In addition, we will make -use of the `RDatasets` package, which provides access to hundreds of -classical data sets. - -## The `NA` Value - -To get started, let's examine the `NA` value. Type the following into the -REPL: - - NA - -One of the essential properties of `NA` is that it poisons other items. To -see this, try to add something like `1` to `NA`: - - 1 + NA - -## The `DataArray` Type - -Now that we see that `NA` is working, let's insert one into a `DataArray`.
-We'll create one now using the `@data` macro: - - dv = @data([NA, 3, 2, 5, 4]) - -To see how `NA` poisons even complex calculations, let's try to take -the mean of the five numbers stored in `dv`: - - mean(dv) - -In many cases we're willing to just ignore `NA` values and remove them -from our vector. We can do that using the `dropna` function: - - dropna(dv) - mean(dropna(dv)) - -Instead of removing `NA` values, you can try to convert the `DataArray` -into a normal Julia `Array` using `convert`: - - convert(Array, dv) - -This fails in the presence of `NA` values, but will succeed if there are -no `NA` values: - - dv[1] = 3 - convert(Array, dv) - -In addition to removing `NA` values and hoping they won't occur, you can -also replace any `NA` values using the `array` function, which takes a -replacement value as an argument: - - dv = @data([NA, 3, 2, 5, 4]) - mean(array(dv, 11)) - -Which strategy for dealing with `NA` values is most appropriate will -typically depend on the specific details of your data analysis pathway. - -Although the examples above employed only 1D `DataArray` objects, the -`DataArray` type defines a completely generic N-dimensional array type. -Operations on generic `DataArray` objects work in higher dimensions in -the same way that they work on Julia's Base `Array` type: - - dm = @data([NA 0.0; 0.0 1.0]) - dm * dm - -## The `DataFrame` Type - -The `DataFrame` type can be used to represent data tables, each column of -which is a `DataArray`. You can specify the columns using keyword arguments: - - df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) - -It is also possible to construct a `DataFrame` in stages: - - df = DataFrame() - df[:A] = 1:8 - df[:B] = ["M", "F", "F", "M", "F", "M", "M", "F"] - df - -The `DataFrame` we build in this way has 8 rows and 2 columns.
You -can check this using the `size` function: - - nrows = size(df, 1) - ncols = size(df, 2) - -We can also look at small subsets of the data in a couple of different ways: - - head(df) - tail(df) - - df[1:3, :] - -Having seen what some of the rows look like, we can try to summarize the -entire data set using `describe`: - - describe(df) - -To focus our search, we start looking at just the means and medians of -specific columns. In the example below, we use numeric indexing to access -the columns of the `DataFrame`: - - mean(df[1]) - median(df[1]) - -We could also have used column names to access individual columns: - - mean(df[:A]) - median(df[:A]) - -We can also apply a function to each column of a `DataFrame` with the `colwise` -function. For example: - - df = DataFrame(A = 1:4, B = randn(4)) - colwise(cumsum, df) - -## Accessing Classic Data Sets - -To see more of the functionality for working with `DataFrame` objects, we need -a more complex data set to work with. We'll use the `RDatasets` package, which -provides access to many of the classical data sets that are available in R. - -For example, we can access Fisher's iris data set using the following functions: - - using RDatasets - iris = dataset("datasets", "iris") - head(iris) - -In the next section, we'll discuss generic I/O strategies for reading and writing -`DataFrame` objects that you can use to import and export your own data files.
diff --git a/doc/sections/03_io.md b/doc/sections/03_io.md deleted file mode 100644 index 834a8dcc87..0000000000 --- a/doc/sections/03_io.md +++ /dev/null @@ -1,96 +0,0 @@ -# DataFrames I/O - -## Importing data from tabular data files - -To read data from a CSV-like file, use the `readtable` function: - - df = readtable("data.csv") - - df = readtable("data.tsv") - - df = readtable("data.wsv") - - df = readtable("data.txt", separator = '\t') - - df = readtable("data.txt", header = false) - -`readtable` requires that you specify the path of the file that you would -like to read as a `String`. It supports many additional keyword arguments: -these are documented in the section on advanced I/O operations. - -## Exporting data to a tabular data file - -To write data to a CSV file, use the `writetable` function: - - df = DataFrame(A = 1:10) - - writetable("output.csv", df) - - writetable("output.dat", df, separator = ',', header = false) - - writetable("output.dat", df, quotemark = '\'', separator = ',') - - writetable("output.dat", df, header = false) - -`writetable` requires the following arguments: - -* `filename::String` -- The path of the file that you wish to write to. -* `df::DataFrame` -- The DataFrame you wish to write to disk. - -Additional advanced options are documented below. - -## Advanced Options for Reading CSV Files - -`readtable` accepts the following optional keyword arguments: - -* `header::Bool` -- Use the information from the file's header line to - determine column names. Defaults to `true`. -* `separator::Char` -- Assume that fields are split by the `separator` character. - If not specified, it will be guessed from the filename: `.csv` defaults to - `','`, `.tsv` defaults to `'\t'`, `.wsv` defaults to `' '`. -* `quotemark::Vector{Char}` -- Assume that fields contained inside of two - `quotemark` characters are quoted, which disables processing of separators and - linebreaks. 
Set to `Char[]` to disable this feature and slightly improve - performance. Defaults to `['"']`. -* `decimal::Char` -- Assume that the decimal place in numbers is written using - the `decimal` character. Defaults to `'.'`. -* `nastrings::Vector{ASCIIString}` -- Translate any of the strings in this - vector into an `NA`. Defaults to `["", "NA"]`. -* `truestrings::Vector{ASCIIString}` -- Translate any of the strings in - this vector into a Boolean `true`. Defaults to `["T", "t", "TRUE", "true"]`. -* `falsestrings::Vector{ASCIIString}` -- Translate any of the strings in - this vector into a Boolean `false`. Defaults to `["F", "f", "FALSE", "false"]`. -* `makefactors::Bool` -- Convert string columns into `PooledDataVector`'s - for use as factors. Defaults to `false`. -* `nrows::Int` -- Read only `nrows` from the file. Defaults to `-1`, which - indicates that the entire file should be read. -* `names::Vector{Symbol}` -- Use the values in this array as the names - for all columns instead of the names in the file's header. Defaults to `[]`, which indicates that the header should be used if present or that numeric names should be invented if there is no header. -* `cleannames::Bool` -- Call `cleancolnames!` on the resulting DataFrame to - ensure that all column names are valid identifiers in Julia. -* `eltypes::Vector{DataType}` -- Specify the types of all columns. Defaults to `[]`. -* `allowcomments::Bool` -- Ignore all text inside comments. Defaults to `false`. -* `commentmark::Char` -- Specify the character that starts comments. Defaults - to `'#'`. -* `ignorepadding::Bool` -- Ignore all whitespace on left and right sides of a - field. Defaults to `true`. -* `skipstart::Int` -- Specify the number of initial rows to skip. Defaults - to `0`. -* `skiprows::Vector{Int}` -- Specify the indices of lines in the input to - ignore. Defaults to `[]`. -* `skipblanks::Bool` -- Skip any blank lines in input. Defaults to `true`.
-* `encoding::Symbol` -- Specify the file's encoding as either `:utf8` or - `:latin1`. Defaults to `:utf8`. - -## Advanced Options for Writing CSV Files - -`writetable` accepts the following optional keyword arguments: - -* `separator::Char` -- The separator character that you would like to use. - Defaults to the output of `getseparator(filename)`, which uses commas for - files that end in `.csv`, tabs for files that end in `.tsv` and a single - space for files that end in `.wsv`. -* `quotemark::Char` -- The character used to delimit string fields. Defaults - to `'"'`. -* `header::Bool` -- Should the file contain a header that specifies the column - names from `df`. Defaults to `true`. diff --git a/doc/sections/04_subsets.md b/doc/sections/04_subsets.md deleted file mode 100644 index c8539c55ab..0000000000 --- a/doc/sections/04_subsets.md +++ /dev/null @@ -1,154 +0,0 @@ -# Accessing and Modifying Entries of DataArray and DataFrame Objects - -## DataArrays - -The `DataArray` type is meant to behave like a standard Julia `Array` and -tries to implement identical indexing rules: - -One-dimensional `DataArray`: -~~~.jl -julia> using DataArrays - -julia> dv = data([1, 2, 3]) -3-element DataArray{Int64,1}: - 1 - 2 - 3 - -julia> dv[1] -1 - -julia> dv[2] = NA -NA - -julia> dv[2] -NA -~~~ - -Two-dimensional `DataArray`: -~~~.jl -julia> using DataArrays - -julia> dm = data([1 2; 3 4]) -2x2 DataArray{Int64,2}: - 1 2 - 3 4 - -julia> dm[1, 1] -1 - -julia> dm[2, 1] = NA -NA - -julia> dm[2, 1] -NA -~~~ - -## DataFrames - -In contrast, a `DataFrame` offers substantially more forms of indexing -because columns can be referred to by name: - -~~~.jl -julia> using DataFrames - -julia> df = DataFrame(A = 1:10, B = 2:2:20) -10x2 DataFrame -| Row # | A | B | -|-------|----|----| -| 1 | 1 | 2 | -| 2 | 2 | 4 | -| 3 | 3 | 6 | -| 4 | 4 | 8 | -| 5 | 5 | 10 | -| 6 | 6 | 12 | -| 7 | 7 | 14 | -| 8 | 8 | 16 | -| 9 | 9 | 18 | -| 10 | 10 | 20 | -~~~ - -Referring to the first column by index or name:
-~~~.jl -julia> df[1] -10-element DataArray{Int64,1}: - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - -julia> df[:A] -10-element DataArray{Int64,1}: - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 -~~~ - -Referring to the first element of the first column: -~~~.jl -julia> df[1, 1] -1 - -julia> df[1, :A] -1 -~~~ - -Selecting a subset of rows by index and an (ordered) subset of columns by name: -~~~.jl -julia> df[1:3, [:A, :B]] -3x2 DataFrame -| Row # | A | B | -|-------|----|----| -| 1 | 1 | 2 | -| 2 | 2 | 4 | -| 3 | 3 | 6 | - -julia> df[1:3, [:B, :A]] -3x2 DataFrame -| Row # | B | A | -|-------|---|---| -| 1 | 2 | 1 | -| 2 | 4 | 2 | -| 3 | 6 | 3 | -~~~ - -Selecting a subset of rows by using a condition: -~~~.jl -julia> df[df[:A] % 2 .== 0, :] -5x2 DataFrame -| Row # | A | B | -|-------|----|----| -| 1 | 2 | 4 | -| 2 | 4 | 8 | -| 3 | 6 | 12 | -| 4 | 8 | 16 | -| 5 | 10 | 20 | - -julia> df[df[:B] % 2 .== 0, :] -10x2 DataFrame -| Row # | A | B | -|-------|----|----| -| 1 | 1 | 2 | -| 2 | 2 | 4 | -| 3 | 3 | 6 | -| 4 | 4 | 8 | -| 5 | 5 | 10 | -| 6 | 6 | 12 | -| 7 | 7 | 14 | -| 8 | 8 | 16 | -| 9 | 9 | 18 | -| 10 | 10 | 20 | -~~~ diff --git a/doc/sections/05_joins_and_indexing.md b/doc/sections/05_joins_and_indexing.md deleted file mode 100644 index 116f2e531b..0000000000 --- a/doc/sections/05_joins_and_indexing.md +++ /dev/null @@ -1,54 +0,0 @@ -# Database-Style Joins and Indexing - -## Joining Data Sets Together - -We often need to combine two or more data sets together to provide a complete -picture of the topic we are studying. For example, suppose that we have the -following two data sets: - - names = DataFrame(ID = [1, 2], Name = ["John Doe", "Jane Doe"]) - jobs = DataFrame(ID = [1, 2], Job = ["Lawyer", "Doctor"]) - -We might want to work with a larger data set that contains both the names and -jobs for each ID. We can do this using the `join` function: - - full = join(names, jobs, on = :ID) - -In relational database theory, this operation is generally referred to as a -join.
The columns used to determine which rows should be combined during a join -are called keys. - -There are seven kinds of joins supported by the DataFrames package: - -* Inner: The output contains rows for values of the key that exist in both - the first (left) and second (right) arguments to `join`. -* Left: The output contains rows for values of the key that exist in the - first (left) argument to `join`, whether or not that value exists in the - second (right) argument. -* Right: The output contains rows for values of the key that exist in the - second (right) argument to `join`, whether or not that value exists in - the first (left) argument. -* Outer: The output contains rows for values of the key that exist in the - first (left) or second (right) argument to `join`. -* Semi: Like an inner join, but output is restricted to columns from the first - (left) argument to `join`. -* Anti: The output contains rows for values of the key that exist in the first - (left) but not the second (right) argument to `join`. As with semi joins, - output is restricted to columns from the first (left) argument. -* Cross: The output is the cartesian product of rows from the first (left) and - second (right) arguments to `join`. 
- -You can control the kind of join that `join` performs using the `kind` -keyword argument: - - a = DataFrame(ID = [1, 2], Name = ["A", "B"]) - b = DataFrame(ID = [1, 3], Job = ["Doctor", "Lawyer"]) - join(a, b, on = :ID, kind = :inner) - join(a, b, on = :ID, kind = :left) - join(a, b, on = :ID, kind = :right) - join(a, b, on = :ID, kind = :outer) - join(a, b, on = :ID, kind = :semi) - join(a, b, on = :ID, kind = :anti) - -Cross joins are the only kind of join that does not use a key: - join(a, b, kind = :cross) diff --git a/doc/sections/06_split_apply_combine.md b/doc/sections/06_split_apply_combine.md deleted file mode 100644 index 1168126ed7..0000000000 --- a/doc/sections/06_split_apply_combine.md +++ /dev/null @@ -1,29 +0,0 @@ -# The Split-Apply-Combine Strategy - -Many data analysis tasks involve splitting a data set into groups, applying -some functions to each of the groups and then combining the results. A -standardized framework for handling this sort of computation is described in -the paper, [The Split-Apply-Combine Strategy for Data Analysis](http://www.jstatsoft.org/v40/i01), -written by Hadley Wickham. - -The DataFrames package supports the Split-Apply-Combine strategy through -the `by` function, which takes in three arguments: (1) a DataFrame, (2) a -column to split the DataFrame on, and (3) a function or expression to -apply to each subset of the DataFrame. 
- -We show several examples of the `by` function applied to the `iris` dataset -below: - - using DataFrames, RDatasets - - iris = dataset("datasets", "iris") - - by(iris, :Species, size) - by(iris, :Species, df -> mean(df[:PetalLength])) - by(iris, :Species, df -> DataFrame(N = size(df, 1))) - -If you only want to split the data set into subsets, use the `groupby` function: - - for subdf in groupby(iris, :Species) - println(size(subdf, 1)) - end diff --git a/doc/sections/07_reshaping_and_pivoting.md b/doc/sections/07_reshaping_and_pivoting.md deleted file mode 100644 index 2a000297d3..0000000000 --- a/doc/sections/07_reshaping_and_pivoting.md +++ /dev/null @@ -1,9 +0,0 @@ -# Reshaping and Pivoting Data - -Reshape data using the `stack` function: - - using DataFrames, RDatasets - - iris = dataset("datasets", "iris") - - stack(iris, :SepalLength) diff --git a/doc/sections/08_sorting.md b/doc/sections/08_sorting.md deleted file mode 100644 index d36431404c..0000000000 --- a/doc/sections/08_sorting.md +++ /dev/null @@ -1,40 +0,0 @@ -# Sorting - -Sorting is a fundamental component of data analysis. Basic sorting is -trivial: just calling `sort!` will sort all columns, in place. - - using DataFrames, RDatasets - - iris = dataset("datasets", "iris") - sort!(iris) - -In Sorting DataFrames, you may want to sort different columns with -different options. Here are some examples showing most of the -possible options. - - sort!(iris, rev = true) - - sort!(iris, cols = [:SepalWidth, :SepalLength]) - - sort!(iris, cols = [order(:Species, by = uppercase), - order(:SepalLength, rev = true)]) - -Keywords used above include `cols` (to specify columns), `rev` (to -sort a column or the whole DataFrame in reverse), and `by` (to apply a -function to a column/DataFrame). Each keyword can either be a single -value, or can be a tuple or array, with values corresponding to -individual columns. 
- -As an alternative to using array or tuple values, use `order` to specify -an ordering for a particular column within a set of columns. - -The following two examples show two ways to sort the `iris` dataset -with the same result: `Species` will be ordered in reverse -lexicographic order, and within species, rows will be sorted by -increasing sepal length and width. - - sort!(iris, cols = (:Species, :SepalLength, :SepalWidth), - rev = (true, false, false)) - - sort!(iris, - cols = (order(:Species, rev = true), :SepalLength, :SepalWidth)) diff --git a/doc/sections/10_formulas.md b/doc/sections/10_formulas.md deleted file mode 100644 index f5e44e318b..0000000000 --- a/doc/sections/10_formulas.md +++ /dev/null @@ -1,38 +0,0 @@ -# The Formula, ModelFrame and ModelMatrix Types - -In regression modeling, we often want to describe the relationship between a -response variable and one or more input variables in terms of main effects -and interactions. To facilitate the specification of a regression model in -terms of the columns of a DataFrame, the DataFrames package provides a -`Formula` type, which is created by the `~` binary operator in Julia: - - fm = Z ~ X + Y - -A `Formula` object can be used to transform a DataFrame into a ModelFrame object: - - df = DataFrame(X = randn(10), Y = randn(10), Z = randn(10)) - mf = ModelFrame(Z ~ X + Y, df) - -A `ModelFrame` object is just a simple wrapper around a `DataFrame`. For -modeling purposes, one generally wants to construct a `ModelMatrix`, which -constructs a `Matrix{Float64}` that can be used directly to fit a -statistical model: - - mm = ModelMatrix(ModelFrame(Z ~ X + Y, df)) - -Note that `mm` contains an additional column consisting entirely of `1.0` -values. This is used to fit an intercept term in a regression model.
- -In addition to specifying main effects, it is possible to specify interactions -using the `&` operator inside a `Formula`: - - mm = ModelMatrix(ModelFrame(Z ~ X + Y + X&Y, df)) - -If you would like to specify both main effects and an interaction term at once, -use the `*` operator inside a `Formula`: - - mm = ModelMatrix(ModelFrame(Z ~ X*Y, df)) - -The construction of model matrices makes it easy to formulate complex -statistical models. These are used to good effect by the -[GLM package](https://github.com/JuliaStats/GLM.jl). diff --git a/doc/sections/11_pooling.md b/doc/sections/11_pooling.md deleted file mode 100644 index 3c9da9dd22..0000000000 --- a/doc/sections/11_pooling.md +++ /dev/null @@ -1,50 +0,0 @@ -# Representing Factors using the PooledDataArray Type - -Often, we have to deal with factors that take on a small -number of levels: - - dv = @data(["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"]) - -The naive encoding used in a `DataArray` represents every -entry of this vector as a full string. In contrast, we -can represent the data more efficiently by replacing the -strings with indices into a small pool of levels. This is -what the `PooledDataArray` does: - - pdv = @pdata(["Group A", "Group A", "Group A", - "Group B", "Group B", "Group B"]) - -In addition to representing repeated data efficiently, -the `PooledDataArray` allows us to determine the levels -of the factor at any time using the `levels` function: - - levels(pdv) - -By default, a `PooledDataArray` is able to represent -`2^32` different levels. You can use less memory by -calling the `compact` function: - - pdv = compact(pdv) - -Often, you will have factors encoded inside a DataFrame -with `DataArray` columns instead of `PooledDataArray` -columns.
You can do conversion of a single column using -the `pool` function: - - pdv = pool(dv) - -Or you can edit the columns of a `DataFrame` in-place -using the `pool!` function: - - df = DataFrame(A = [1, 1, 1, 2, 2, 2], - B = ["X", "X", "X", "Y", "Y", "Y"]) - pool!(df, [:A, :B]) - -Pooling columns is important for working with the -[GLM package](https://github.com/JuliaStats/GLM.jl). -When fitting regression models, `PooledDataArray` columns -in the input are translated into 0/1 indicator columns -in the `ModelMatrix` -- with one column for each of the levels -of the `PooledDataArray`. This allows one to analyze categorical -data efficiently. diff --git a/prototypes/benchmark_datastream.jl b/prototypes/benchmark_datastream.jl deleted file mode 100644 index 3fc8eced2f..0000000000 --- a/prototypes/benchmark_datastream.jl +++ /dev/null @@ -1,21 +0,0 @@ -filename = Pkg.dir("DataFrames", "test", "data", "big_data.csv") - -minibatch_sizes = [1, 5, 25, 100, 1_000, 10_000] - -for f in (colmeans, colvars, cor) - for minibatch_size in minibatch_sizes - ds = DataStream(filename, minibatch_size) - N = 3 - df = benchmark(() -> apply(f, (ds,)), - "DataStream Functions", - join({ - string(f), - "w/ minibatches of", - minibatch_size, - "rows" - }, " "), - N) - # TODO: Keep permanent record - printtable(df, header=false) - end -end diff --git a/prototypes/dataframe_blocks.jl b/prototypes/dataframe_blocks.jl deleted file mode 100644 index d665202ae5..0000000000 --- a/prototypes/dataframe_blocks.jl +++ /dev/null @@ -1,627 +0,0 @@ -using Blocks - -importall Blocks - -export Block, DDataFrame, as_dataframe, dreadtable, dwritetable, writetable, gather - -type DDataFrame <: AbstractDataFrame - rrefs::Vector - procs::Vector - nrows::Vector - ncols::Int - coltypes::Vector - colindex::Index - - DDataFrame(rrefs::Vector, procs::Vector) = _dims(new(rrefs, procs)) -end - -Base.show(io::IO, dt::DDataFrame) = println("$(nrow(dt))x$(ncol(dt)) DDataFrame. 
$(length(dt.rrefs)) blocks over $(length(union(dt.procs))) processors") - -gather(dt::DDataFrame) = reduce((x,y)->vcat(fetch(x), fetch(y)), dt.rrefs) -#convert(::Type{DataFrame}, dt::DDataFrame) = reduce((x,y)->vcat(fetch(x), fetch(y)), dt.rrefs) - -# internal methods -function _dims(dt::DDataFrame, rows::Bool=true, cols::Bool=true) - dt.nrows = pmap(x->nrow(fetch(x)), Block(dt)) - cnames = remotecall_fetch(dt.procs[1], (x)->colnames(fetch(x)), dt.rrefs[1]) - dt.ncols = length(cnames) - dt.colindex = Index(cnames) - dt.coltypes = remotecall_fetch(dt.procs[1], (x)->coltypes(fetch(x)), dt.rrefs[1]) - # propagate the column names - for nidx in 2:length(dt.procs) - remotecall(dt.procs[nidx], x->colnames!(fetch(x), cnames), dt.rrefs[nidx]) - end - dt -end - -function as_dataframe(bio::BlockableIO; kwargs...) - kwargs = _check_readtable_kwargs(kwargs...) - #tbl = readtable(bio; kwargs...) - nbytes = -1 - if isa(bio, IOStream) - p1 = position(bio) - seekend(bio) - nbytes = position(bio) - p1 - seek(bio, p1) - elseif isa(bio, IOBuffer) || isa(bio, BlockIO) - nbytes = nb_available(bio) - else - error("can not determine size of stream") - end - - kwdict = { :header => false, - :separator => ',', - #:allowquotes => true, - :quotemark => ['"'], - :decimal => '.', - :nastrings => ASCIIString["", "NA"], - :truestrings => ASCIIString["T", "t", "TRUE", "true"], - :falsestrings => ASCIIString["F", "f", "FALSE", "false"], - :makefactors => false, - :colnames => UTF8String[], - :cleannames => false, - :coltypes => Any[], - :allowcomments => false, - :commentmark => '#', - :ignorepadding => true, - :skipstart => 0, - :skiprows => Int[], - :skipblanks => true, - :encoding => :utf8 } - - for kw in kwargs - kwdict[kw[1]] = kw[2] - end - - poargs = {} - for argname in names(DataFrames.ParseOptions) - push!(poargs, kwdict[argname]) - end - - po = DataFrames.ParseOptions(kwdict[:header], - kwdict[:separator], - kwdict[:quotemark], - kwdict[:decimal], - kwdict[:nastrings], - 
kwdict[:truestrings], - kwdict[:falsestrings], - kwdict[:makefactors], - kwdict[:colnames], - kwdict[:cleannames], - kwdict[:coltypes], - kwdict[:allowcomments], - kwdict[:commentmark], - kwdict[:ignorepadding], - kwdict[:skipstart], - kwdict[:skiprows], - kwdict[:skipblanks], - kwdict[:encoding]) - - p = DataFrames.ParsedCSV(Array(Uint8, nbytes), Array(Int, 1), Array(Int, 1), BitArray(1)) - - tbl = DataFrames.readtable!(p, bio, nbytes, po) - tbl -end - -as_dataframe(A::Array) = DataFrame(A) - -Block(dt::DDataFrame) = Block(dt, dt.rrefs, dt.procs, as_it_is, as_it_is) - -function _check_readtable_kwargs(kwargs...) - kwargs = {kwargs...} - for kw in kwargs - (kw[1] in [:skipstart, :skiprows]) && error("dreadtable does not support $(kw[1])") - end - for (idx,kw) in enumerate(kwargs) - if (kw[1]==:header) - (kw[2] != false) && error("dreadtable does not support reading of headers") - splice!(kwargs, idx) - break - end - end - push!(kwargs, (:header,false)) - kwargs -end - -function dreadtable(b::Block; kwargs...) - kwargs = _check_readtable_kwargs(kwargs...) - if (b.affinity == Blocks.no_affinity) - b.affinity = [[w] for w in workers()] - end - rrefs = pmap(x->as_dataframe(x;kwargs...), b; fetch_results=false) - procs = map(x->x.where, rrefs) - DDataFrame(rrefs, procs) -end -dreadtable(fname::String; kwargs...) = dreadtable(Block(Base.FS.File(fname)) |> as_io |> as_recordio; kwargs...) -function dreadtable(io::Union(Base.AsyncStream,IOStream), chunk_sz::Int, merge_chunks::Bool=true; kwargs...) 
- b = (Block(io, chunk_sz, '\n') .> as_recordio) .> as_bytearray - rrefs = pmap(x->as_dataframe(PipeBuffer(x); kwargs...), b; fetch_results=false) - procs = map(x->x.where, rrefs) - - if merge_chunks - uniqprocs = unique(procs) - collected_refs = map(proc->rrefs[find(x->(x==proc), procs)], uniqprocs) - merging_block = Block(collected_refs, collected_refs, uniqprocs, as_it_is, as_it_is) - - vcat_refs = pmap(reflist->vcat([fetch(x) for x in reflist]...), merging_block; fetch_results=false) - rrefs = vcat_refs - procs = uniqprocs - end - - DDataFrame(rrefs, procs) -end - - -## -# describe for ddataframe -# approximate median and quantile calculation -# not very efficient as it is an iterative process -function _randcolval(t, colname, minv, maxv) - ex = :(minv .< colname .< maxv) - ex.args[1] = minv - ex.args[3] = symbol(colname) - ex.args[5] = maxv - md = t[ex,symbol(colname)] - (length(md) == 0) && (return []) - return md[rand(1:length(md))] -end - -function _count_col_seps(t::DataFrame, colname, v) - nlt = ngt = 0 - col = t[colname] - for idx in 1:nrow(t) - isna(col[idx]) && continue - (col[idx] > v) && (ngt += 1) - (col[idx] < v) && (nlt += 1) - end - nlt,ngt -end - -function _count_col_seps(dt::DDataFrame, colname, v) - f = let colname=colname,v=v - (t)->_count_col_seps(fetch(t), colname, v) - end - nltgt = pmap(f, Block(dt)) - nlt = ngt = 0 - for (n1lt,n1gt) in nltgt - nlt += n1lt - ngt += n1gt - end - (nlt,ngt) -end - -function _num_na(t, cnames) - colnames = collect(cnames) - nrows = nrow(t) - cnts = zeros(Int,length(colnames)) - c = t[colnames] - for cidx in 1:length(colnames) - cc = c[cidx] - cnt = 0 - for idx in 1:nrows - isna(cc[idx]) && (cnt += 1) - end - cnts[cidx] = cnt - end - cnts -end - -function _colranges(t::DataFrame, cnames) - colnames = collect(cnames) - nrows = nrow(t) - ncols = length(colnames) - mins = cell(ncols) - maxs = cell(ncols) - sums = zeros(ncols) - numvalids = zeros(Int,ncols) - for cidx in 1:ncols - _min = _max = NA - _sum = 0 - 
_numvalid = 0 - cc = t[colnames[cidx]] - for idx in 1:nrows - ccval = cc[idx] - if !isna(ccval) - (isna(_min) || (_min > ccval)) && (_min = ccval) - (isna(_max) || (_max < ccval)) && (_max = ccval) - _sum += ccval - _numvalid += 1 - end - end - mins[cidx] = _min - maxs[cidx] = _max - sums[cidx] = _sum - numvalids[cidx] = _numvalid - end - mins,maxs,sums,numvalids -end - -function _colranges(dt::DDataFrame, cnames) - f = let cnames=cnames - (t)->_colranges(fetch(t), cnames) - end - ret = pmap(f, Block(dt)) - allmins = map(x->x[1], ret) - allmaxs = map(x->x[2], ret) - allsums = map(x->x[3], ret) - allnumvalids = map(x->x[4], ret) - ncols = length(cnames) - - mins = {} - maxs = {} - sums = {} - numvalids = {} - for cidx in 1:ncols - push!(mins, mapreduce(x->x[cidx], min, allmins)) - push!(maxs, mapreduce(x->x[cidx], max, allmaxs)) - push!(sums, mapreduce(x->x[cidx], +, allsums)) - push!(numvalids, mapreduce(x->x[cidx], +, allnumvalids)) - end - mins,maxs,(sums./numvalids),numvalids -end - -function _sorted_col_vals_at_pos(dt::DDataFrame, col, numvalid, minv, maxv, pos) - (isna(minv) || isna(maxv)) && (return NA) - posr = numvalid - pos - - while true - # get a random value between min and max for col - f = let col=col,minv=minv,maxv=maxv - (t)->_randcolval(fetch(t), col, minv, maxv) - end - pivots = pmap(f, Block(dt)) - pivots = filter(x->(x != []), pivots) - # there's no other value between minv and maxv. take pivot as one of min and max - (length(pivots) == 0) && (pivots = [minv, maxv]) - pivot = pivots[rand(1:length(pivots))][1] - - #println("pivot $pivot chosen from: $pivots") - - nrowslt,nrowsgt = _count_col_seps(dt, col, pivot) - - #println("for $(col) => $(minv):$(maxv). 
rowdist: $(nrowslt) - $(pos) - $(nrowsgt)") - if (nrowslt <= pos) && (nrowsgt <= posr) - return pivot - elseif (nrowsgt > posr) - minv = pivot - else # (nrowslt > pos) - maxv = pivot - end - end -end - -function _dquantile(dt::DDataFrame, cname, numvalid, minv, maxv, q) - qpos = quantile([1:numvalid], q) - lo = ifloor(qpos) - hi = iceil(qpos) - - local retvals::Array - - if lo == hi - retval = _sorted_col_vals_at_pos(dt, cname, numvalid, minv, maxv, lo) - else - retval1 = _sorted_col_vals_at_pos(dt, cname, numvalid, minv, maxv, lo) - retval2 = _sorted_col_vals_at_pos(dt, cname, numvalid, minv, maxv, hi) - retval = (retval1 + (retval2-retval1)*(qpos-lo)) - end - retval -end - -describe(dt::DDataFrame) = describe(STDOUT, dt) -function describe(io, dt::DDataFrame) - nrows = nrow(dt) - cnames = colnames(dt) - ctypes = coltypes(dt) - qcolnames = String[] - for idx in 1:length(cnames) - ((ctypes[idx] <: Number)) && push!(qcolnames, cnames[idx]) - end - - numnas = pmapreduce(x->_num_na(fetch(x), cnames), +, Block(dt)) - qcols = Dict() - if !isempty(qcolnames) - mins,maxs,means,numvalids = _colranges(dt, qcolnames) - - for idx in 1:length(qcolnames) - q1 = _dquantile(dt, qcolnames[idx], numvalids[idx], mins[idx], maxs[idx], 0.25) - q2 = _dquantile(dt, qcolnames[idx], numvalids[idx], mins[idx], maxs[idx], 0.5) - q3 = _dquantile(dt, qcolnames[idx], numvalids[idx], mins[idx], maxs[idx], 0.75) - - qcols[qcolnames[idx]] = {mins[idx], q1, q2, means[idx], q3, maxs[idx]} - end - end - - statNames = ["Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max"] - for idx in 1:length(cnames) - println(cnames[idx]) - if numnas[idx] == nrows - println(io, " * All NA * ") - continue - end - - if (ctypes[idx] <: Number) - statVals = qcols[cnames[idx]] - for i = 1:6 - println(io, string(rpad(statNames[i], 8, " "), " ", string(statVals[i]))) - end - else - println(io, "Length $(nrows)") - println(io, "Type $(ctypes[idx])") - end - - println(io, "NAs $(numnas[idx])") - println(io, "NA% 
$(round(numnas[idx]*100/nrows, 2))%") - println(io, "") - end -end - - -## -# indexing into DDataFrames -function Base.getindex(dt::DDataFrame, col_ind::DataFrames.ColumnIndex) - rrefs = pmap(x->getindex(fetch(x), col_ind), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) -end -function Base.getindex{T <: DataFrames.ColumnIndex}(dt::DDataFrame, col_inds::AbstractVector{T}) - rrefs = pmap(x->getindex(fetch(x), col_inds), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) -end - -# Operations on Distributed DataFrames -# TODO: colmedians, colstds, colvars, colffts, colnorms - -for f in [DataFrames.elementary_functions, DataFrames.unary_operators, :copy, :deepcopy, :isfinite, :isnan] - @eval begin - function ($f)(dt::DDataFrame) - rrefs = pmap(x->($f)(fetch(x)), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - end -end - -for f in [:without] - @eval begin - function ($f)(dt::DDataFrame, p1) - rrefs = pmap(x->($f)(fetch(x), p1), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - end -end - -with(dt::DDataFrame, c::Expr) = vcat(pmap(x->with(fetch(x), c), Block(dt))...) -with(dt::DDataFrame, c::Symbol) = vcat(pmap(x->with(fetch(x), c), Block(dt))...) 
- -function Base.delete!(dt::DDataFrame, c) - pmap(x->begin delete!(fetch(x),c); nothing; end, Block(dt)) - _dims(dt, false, true) -end - -function deleterows!(dt::DDataFrame, keep_inds::Vector{Int}) - # split keep_inds based on index ranges - split_inds = {} - beg_row = 1 - for idx in 1:length(dt.nrows) - end_row = dt.nrows[idx] - part_rows = filter(x->(beg_row <= x <= (beg_row+end_row-1)), keep_inds) .- (beg_row-1) - push!(split_inds, remotecall_wait(dt.procs[idx], DataFrame, part_rows)) - beg_row = end_row+1 - end - dt_keep_inds = DDataFrame(split_inds, dt.procs) - - pmap((x,y)->begin DataFrames.deleterows!(fetch(x),y[1].data); nothing; end, Block(dt), Block(dt_keep_inds)) - _dims(dt, true, false) -end - -function within!(dt::DDataFrame, c::Expr) - pmap(x->begin within!(fetch(x),c); nothing; end, Block(dt)) - _dims(dt, false, true) -end - -for f in (:(DataArrays.isna), :complete_cases) - @eval begin - function ($f)(dt::DDataFrame) - vcat(pmap(x->($f)(fetch(x)), Block(dt))...) - end - end -end -function complete_cases!(dt::DDataFrame) - pmap(x->begin complete_cases!(fetch(x)); nothing; end, Block(dt)) - _dims(dt, true, true) -end - - - -for f in DataFrames.binary_operators - @eval begin - function ($f)(dt::DDataFrame, x::Union(Number, NAtype)) - rrefs = pmap(y->($f)(fetch(y),x), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - function ($f)(x::Union(Number, NAtype), dt::DDataFrame) - rrefs = pmap(y->($f)(x,fetch(y)), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - end -end - -for (f,_) in DataFrames.vectorized_comparison_operators - for t in [:Number, :String, :NAtype] - @eval begin - function ($f){T <: ($t)}(dt::DDataFrame, x::T) - rrefs = pmap(y->($f)(fetch(y),x), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - function ($f){T <: ($t)}(x::T, dt::DDataFrame) - rrefs = pmap(y->($f)(x,fetch(y)), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) - end - end - end - @eval begin - function 
($f)(a::DDataFrame, b::DDataFrame) - rrefs = pmap((x,y)->($f)(fetch(x),fetch(y)), Block(a), Block(b); fetch_results=false) - DDataFrame(rrefs, a.procs) - end - end -end - -for f in (:colmins, :colmaxs, :colprods, :colsums, :colmeans) - @eval begin - function ($f)(dt::DDataFrame) - ($f)(vcat(pmap(x->($f)(fetch(x)), Block(dt))...)) - end - end -end - -for f in DataFrames.array_arithmetic_operators - @eval begin - function ($f)(a::DDataFrame, b::DDataFrame) - # TODO: check dimensions - rrefs = pmap((x,y)->($f)(fetch(x),fetch(y)), Block(a), Block(b); fetch_results=false) - DDataFrame(rrefs, a.procs) - end - end -end - -for f in [:(Base.all), :(Base.any)] - @eval begin - function ($f)(dt::DDataFrame) - ($f)(pmap(x->($f)(fetch(x)), Block(dt))) - end - end -end - -function Base.isequal(a::DDataFrame, b::DDataFrame) - all(pmap((x,y)->isequal(fetch(x),fetch(y)), Block(a), Block(b))) -end - -nrow(dt::DDataFrame) = sum(dt.nrows) -ncol(dt::DDataFrame) = dt.ncols -DataArrays.head(dt::DDataFrame) = remotecall_fetch(dt.procs[1], x->head(fetch(x)), dt.rrefs[1]) -DataArrays.tail(dt::DDataFrame) = remotecall_fetch(dt.procs[end], x->tail(fetch(x)), dt.rrefs[end]) -colnames(dt::DDataFrame) = dt.colindex.names -function colnames!(dt::DDataFrame, vals) - pmap(x->colnames!(fetch(x), vals), Block(dt)) - names!(dt.colindex, vals) -end -function clean_colnames!(dt::DDataFrame) - new_names = map(n -> replace(n, r"\W", "_"), colnames(dt)) - colnames!(dt, new_names) - return -end - -for f in [:rename, :rename!] - @eval begin - function ($f)(dt::DDataFrame, from, to) - pmap(x->($f)(fetch(x), from, to), Block(dt); fetch_results=false) - ($f)(dt.colindex, from, to) - end - end -end - -coltypes(dt::DDataFrame) = dt.coltypes -index(dt::DDataFrame) = dt.colindex - -for f in [:vcat, :hcat, :rbind, :cbind] - @eval begin - function ($f)(dt::DDataFrame...) 
- rrefs = pmap((x...)->($f)([fetch(y) for y in x]...), [Block(a) for a in dt]...; fetch_results=false) - procs = dt[1].procs - DDataFrame(rrefs, procs) - end - end -end - -function Base.merge(dt::DDataFrame, t::DataFrame, bycol, jointype) - (jointype != "inner") && error("only inner joins are supported") - - rrefs = pmap((x)->merge(fetch(x),t), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) -end - -function Base.merge(t::DataFrame, dt::DDataFrame, bycol, jointype) - (jointype != "inner") && error("only inner joins are supported") - - rrefs = pmap((x)->merge(t,fetch(x)), Block(dt); fetch_results=false) - DDataFrame(rrefs, dt.procs) -end - -colwise(f::Function, dt::DDataFrame) = error("Not supported. Try colwise variant meant for DDataFrame instead.") -colwise(fns::Vector{Function}, dt::DDataFrame) = error("Not supported. Try colwise variant meant for DDataFrame instead.") -colwise(d::DDataFrame, s::Vector{Symbol}, cn::Vector) = error("Not supported. Try colwise variant meant for DDataFrame instead.") - -function colwise(f::Function, r::Function, dt::DDataFrame) - resarr = pmap((x)->colwise(f,fetch(x)), Block(dt)) - combined = hcat(resarr...) - map(x->r([combined[x, :]...]), 1:size(combined,1)) -end -function colwise(fns::Vector{Function}, rfns::Vector{Function}, dt::DDataFrame) - nfns = length(fns) - (nfns != length(rfns)) && error("number of operations must match number of reduce operations") - resarr = pmap((x)->colwise(fns,fetch(x)), Block(dt)) - combined = hcat(resarr...) - map(x->(rfns[x%nfns=1])([combined[x, :]...]), 1:size(combined,1)) -end -function colwise(dt::DDataFrame, s::Vector{Symbol}, reduces::Vector{Function}, cn::Vector) - nfns = length(s) - (nfns != length(reduces)) && error("number of operations must match number of reduce operations") - resarr = pmap((x)->colwise(fetch(x), s, cn), Block(dt)) - combined = vcat(resarr...) 
- resdf = DataFrame() - - for (idx,(colname,col)) in enumerate(combined) - resdf[colname] = (reduces[idx%nfns+1])(col) - end - resdf -end - -by(dt::DDataFrame, cols, f::Function) = error("Not supported. Try by variant meant for DDataFrame instead.") -by(dt::DDataFrame, cols, e::Expr) = error("Not supported. Try by variant meant for DDataFrame instead.") -by(dt::DDataFrame, cols, s::Vector{Symbol}) = error("Not supported. Try by variant meant for DDataFrame instead.") -by(dt::DDataFrame, cols, s::Symbol) = error("Not supported. Try by variant meant for DDataFrame instead.") - -function by(dt::DDataFrame, cols, f, reducer::Function) - resarr = pmap((x)->by(fetch(x), cols, f), Block(dt)) - combined = vcat(resarr...) - by(combined, cols, x->reducer(x[end])) -end - - -dwritetable(path::String, suffix::String, dt::DDataFrame; kwargs...) = pmap(x->begin; fn=joinpath(path, string(myid())*"."*suffix); writetable(fn, fetch(x); kwargs...); fn; end, Block(dt)) - -function writetable(filename::String, dt::DDataFrame, do_gather::Bool=false; kwargs...) - do_gather && (return writetable(filename, gather(dt); kwargs...)) - - hdr = (:header,true) - hdrnames = [] - for (idx,kw) in enumerate(kwargs) - (kw[1]==:header) && (hdr=splice!(kwargs, idx); break) - end - push!(kwargs, (:header,false)) - - basen = basename(filename) - path = filename[1:(length(filename)-length(basename(filename)))] - filenames = dwritetable(path, basen, dt, header=false) - - if hdr[2] - h = DataFrame() - for cns in colnames(dt) h[cns] = [] end - writetable(filename, h) - end - f = open(filename, hdr[2] ? "a" : "w") - - const lb = 1024*16 - buff = Array(Uint8, lb) - for fn in filenames - fp = open(fn) - while(!eof(fp)) - avlb = nb_available(fp) - write(f, read(fp, (avlb < lb) ? 
Array(Uint8, avlb) : buff)) - end - close(fp) - rm(fn) - end - close(f) -end - - diff --git a/prototypes/datastream.jl b/prototypes/datastream.jl deleted file mode 100644 index c0b61899f9..0000000000 --- a/prototypes/datastream.jl +++ /dev/null @@ -1,495 +0,0 @@ -# TODO: NonSeekableDataStream -# TODO: Remove coltypes(ds::AbstractDataStream) -# TODO: Remove colnames(ds::AbstractDataStream) -# TODO: Implement MatrixStream, which reads CSV's into a Matrix of Float64's - -export readstream - -abstract AbstractDataStream - -# Store DataFrame that will contain minibatches in stream -type SeekableDataStream <: AbstractDataStream - io::IO - p::ParsedCSV - o::ParseOptions - nrows::Int - df::DataFrame - - function SeekableDataStream(io::IO, - p::ParsedCSV, - o::ParseOptions, - nrows::Integer, - df::DataFrame) - r = new(io, p, o, int(nrows), df) - finalizer(r, r -> close(r.io)) - return r - end -end - -# TODO: Use a custom prefix-checking tester for instrings. -function readstream(pathname::String; - nrows::Integer = 1, - header::Bool = true, - separator::Char = ',', - allowquotes::Bool = true, - quotemark::Vector{Char} = ['"'], - decimal::Char = '.', - nastrings::Vector = ASCIIString["", "NA"], - truestrings::Vector = ASCIIString["T", "t", "TRUE", "true"], - falsestrings::Vector = ASCIIString["F", "f", "FALSE", "false"], - makefactors::Bool = false, - colnames::Vector = UTF8String[], - cleannames::Bool = false, - coltypes::Vector{DataType} = DataType[], - allowcomments::Bool = false, - commentmark::Char = '#', - ignorepadding::Bool = true, - skipstart::Int = 0, - skiprows::Vector{Int} = Int[], - skipblanks::Bool = true, - encoding::Symbol = :utf8, - allowescapes::Bool = true) - - io = open(pathname, "r") - nbytes = 2^20 - p = ParsedCSV(Array(Uint8, nbytes), - Array(Int, 1), - Array(Int, 1), - BitArray(1)) - o = ParseOptions(header, - separator, - #allowquotes, - quotemark, - decimal, - nastrings, - truestrings, - falsestrings, - makefactors, - colnames, - cleannames, - 
coltypes, - allowcomments, - commentmark, - ignorepadding, - skipstart, - skiprows, - skipblanks, - encoding, - allowescapes) - return SeekableDataStream(io, p, o, nrows, DataFrame()) -end - -function Base.show(io::IO, ds::SeekableDataStream) - @printf io "SeekableDataStream" - @printf io "Minibatch Size: %d\n" ds.nrows - return -end - -# TODO: Return nothing here? -function Base.start(s::SeekableDataStream) - seek(s.io, 0) - return nothing -end - -# TODO: Return df, nothing here? -function Base.next(s::SeekableDataStream, n::Nothing) - if position(s.io) == 0 - s.df = readtable!(s.p, s.io, s.nrows, s.o) - return s.df, nothing - else - bytes, fields, rows = readnrows!(s.p, s.io, s.nrows, s.o) - cols = fld(fields, rows) - filldf!(s.df, rows, cols, bytes, fields, s.p, s.o) - return s.df, nothing - end -end - -Base.done(s::SeekableDataStream, n::Nothing) = eof(s.io) - -############################################################################## -# -# Streaming data functions -# -############################################################################## - -function Base.sum(ds::AbstractDataStream, dim::Integer) - if dim != 2 - error("Only column sums available for AbstractDataStream") - end - - df1 = start(ds) - - df1, df1 = next(ds, df1) - - cnames = names(df1) - p = length(cnames) - sums = zeros(p) - counts = zeros(Int, p) - - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - sums[j] += c[i] - counts[j] += 1 - end - end - end - end - - while !done(ds, df1) - df1, df1 = next(ds, df1) - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - sums[j] += c[i] - counts[j] += 1 - end - end - end - end - end - - res = DataFrame({Float64 for i in 1:p}, cnames, 1) - - for j in 1:p - if counts[j] != 0 - res[1, j] = sums[j] - end - end - - return res -end - -function Base.prod(ds::AbstractDataStream, dim::Integer) - if dim != 2 - error("Only column sums available for 
AbstractDataStream") - end - - df1 = start(ds) - - df1, df1 = next(ds, df1) - - cnames = names(df1) - p = length(cnames) - prods = ones(p) - counts = zeros(Int, p) - - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - prods[j] *= c[i] - counts[j] += 1 - end - end - end - end - - while !done(ds, df1) - df1, df1 = next(ds, df1) - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - prods[j] *= c[i] - counts[j] += 1 - end - end - end - end - end - - res = DataFrame({Float64 for i in 1:p}, cnames, 1) - - for j in 1:p - if counts[j] != 0 - res[1, j] = prods[j] - end - end - - return res -end - -function Base.mean(ds::AbstractDataStream, dim::Integer) - if dim != 2 - error("Only column sums available for AbstractDataStream") - end - - df1 = start(ds) - - df1, df1 = next(ds, df1) - - cnames = names(df1) - p = length(cnames) - sums = zeros(p) - counts = zeros(Int, p) - - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - sums[j] += c[i] - counts[j] += 1 - end - end - end - end - - while !done(ds, df1) - df1, df1 = next(ds, df1) - n = nrow(df1) - for j in 1:p - c = df1[j] - if eltype(c) <: Real - for i in 1:n - if !isna(c[i]) - sums[j] += c[i] - counts[j] += 1 - end - end - end - end - end - - res = DataFrame({Float64 for i in 1:p}, cnames, 1) - - for j in 1:p - if counts[j] != 0 - res[1, j] = sums[j] / counts[j] - end - end - - return res -end - -# function colvars(ds::AbstractDataStream) -# p = length(colnames(ds)) -# means = zeros(p) -# deltas = zeros(p) -# m2s = zeros(p) -# vars = zeros(p) -# ns = zeros(Int, p) - -# for minibatch in ds -# for row_index in 1:nrow(minibatch) -# for column_index in 1:p -# if coltypes(minibatch)[column_index] <: Real && !isna(minibatch[row_index, column_index]) -# ns[column_index] += 1 -# deltas[column_index] = minibatch[row_index, column_index] - means[column_index] -# means[column_index] += 
deltas[column_index] / ns[column_index] -# m2s[column_index] = m2s[column_index] + deltas[column_index] * (minibatch[row_index, column_index] - means[column_index]) -# vars[column_index] = m2s[column_index] / (ns[column_index] - 1) -# end -# end -# end -# end - -# result_types = {Float64 for i in 1:p} -# results = DataFrame(result_types, colnames(ds), 1) - -# for column_index in 1:p -# if ns[column_index] != 0 -# results[1, column_index] = vars[column_index] -# end -# end - -# return results -# end - -# function colstds(ds::AbstractDataStream) -# vars = colvars(ds) -# stds = deepcopy(vars) -# column_types = coltypes(vars) -# for j in 1:length(column_types) -# if column_types[j] <: Real -# stds[1, j] = sqrt(vars[1, j]) -# end -# end -# return stds -# end - -# function colmins(ds::AbstractDataStream) -# p = length(colnames(ds)) -# mins = [Inf for i in 1:p] -# ns = zeros(Int, p) - -# for minibatch in ds -# for row_index in 1:nrow(minibatch) -# for column_index in 1:p -# if coltypes(minibatch)[column_index] <: Real && !isna(minibatch[row_index, column_index]) -# if minibatch[row_index, column_index] < mins[column_index] -# mins[column_index] = minibatch[row_index, column_index] -# ns[column_index] += 1 -# end -# end -# end -# end -# end - -# result_types = {Float64 for i in 1:p} -# df = DataFrame(result_types, colnames(ds), 1) - -# for column_index in 1:p -# if ns[column_index] != 0 -# df[1, column_index] = mins[column_index] -# end -# end - -# return df -# end - -# function colmaxs(ds::AbstractDataStream) -# p = length(colnames(ds)) -# maxs = [-Inf for i in 1:p] -# ns = zeros(Int, p) - -# for minibatch in ds -# for row_index in 1:nrow(minibatch) -# for column_index in 1:p -# if coltypes(minibatch)[column_index] <: Real && !isna(minibatch[row_index, column_index]) -# if minibatch[row_index, column_index] > maxs[column_index] -# maxs[column_index] = minibatch[row_index, column_index] -# ns[column_index] += 1 -# end -# end -# end -# end -# end - -# result_types = 
{Float64 for i in 1:p} -# df = DataFrame(result_types, colnames(ds), 1) - -# for column_index in 1:p -# if ns[column_index] != 0 -# df[1, column_index] = maxs[column_index] -# end -# end - -# return df -# end - -# function colranges(ds::AbstractDataStream) -# p = length(colnames(ds)) -# mins = [Inf for i in 1:p] -# maxs = [-Inf for i in 1:p] -# ns = zeros(Int, p) - -# for minibatch in ds -# for row_index in 1:nrow(minibatch) -# for column_index in 1:p -# if coltypes(minibatch)[column_index] <: Real && !isna(minibatch[row_index, column_index]) -# ns[column_index] += 1 -# if minibatch[row_index, column_index] < mins[column_index] -# mins[column_index] = minibatch[row_index, column_index] -# end -# if minibatch[row_index, column_index] > maxs[column_index] -# maxs[column_index] = minibatch[row_index, column_index] -# end -# end -# end -# end -# end - -# result_types = {Float64 for i in 1:p} -# df_mins = DataFrame(result_types, colnames(ds), 1) -# df_maxs = DataFrame(result_types, colnames(ds), 1) - -# for column_index in 1:p -# if ns[column_index] != 0 -# df_mins[1, column_index] = mins[column_index] -# df_maxs[1, column_index] = maxs[column_index] -# end -# end - -# return (df_mins, df_maxs) -# end - -# # Two-pass algorithm for covariance and correlation -# function cov(ds::AbstractDataStream) -# p = length(colnames(ds)) - -# # Make one pass to compute means -# means = colmeans(ds) - -# # Now compute covariances during second pass -# ns = zeros(Int, p, p) -# covariances = datazeros(p, p) - -# for minibatch in ds -# for row_index in 1:nrow(minibatch) -# for column_index in 1:p -# for alt_column_index in 1:p -# if coltypes(minibatch)[column_index] <: Real && -# !isna(minibatch[row_index, column_index]) && -# coltypes(minibatch)[alt_column_index] <: Real && -# !isna(minibatch[row_index, alt_column_index]) -# ns[column_index, alt_column_index] += 1 -# n = ns[column_index, alt_column_index] -# a = minibatch[row_index, column_index] - means[1, column_index] -# b = 
minibatch[row_index, alt_column_index] - means[1, alt_column_index] -# covariances[column_index, alt_column_index] = ((n - 1) / n) * covariances[column_index, alt_column_index] + (a * b) / n -# end -# end -# end -# end -# end - -# # Scale estimates by (n / (n - 1)) -# for i in 1:p -# for j in 1:p -# if ns[i, j] <= 2 -# covariances[i, j] = NA -# else -# n = ns[i, j] -# covariances[i, j] *= (n / (n - 1)) -# end -# end -# end - -# return covariances -# end - -# function cor(ds::AbstractDataStream) -# covariances = cov(ds) -# correlations = deepcopy(covariances) -# p = nrow(correlations) -# for i in 1:p -# for j in 1:p -# correlations[i, j] = covariances[i, j] / sqrt(covariances[i, i] * covariances[j, j]) -# end -# end -# return correlations -# end - -function Base.select(ds::AbstractDataStream, query::Integer) - i = 0 - for df in ds - u = nrow(df) - if i + u > query - return df[query - i, :] - end - i += u - end - error("Did not find requested row") -end - -# # TODO: Stop returning empty DataFrame at the end of a stream -# # (NOTE: Probably not possible because we don't know nrows.) -# # TODO: Implement -# # * colentropys -# # * colcardinalities -# # * colmedians -# # * colffts -# # * colnorms diff --git a/prototypes/doc/09_datastreams.md b/prototypes/doc/09_datastreams.md deleted file mode 100644 index 4642497e1a..0000000000 --- a/prototypes/doc/09_datastreams.md +++ /dev/null @@ -1,48 +0,0 @@ -# Processing Streaming Data - -In modern data analysis settings, we often need to work with streaming data -sources. This is particularly important when: - -* Data sets are too large to store in RAM -* Data sets are being generated in real time - -Julia is well-suited to both. The DataFrames package handles streaming -data by construcing an `AbstractDataStream` object, which is an iterable -object that allows programmers to work with a sequence of small -`DataFrame` objects rather than one large `DataFrame` object. 
- -Conventionally, these small `DataFrame` objects are called minibatches. By -default, Julia generates minibatches that contain **exactly** one row -of data, but this can be easily changed. To see how an `AbstractDataStream` -works, we'll loop over the rows of a dataset we use for benchmarking -DataFrames. - - using DataFrames - - path = Pkg.dir("DataFrames", - "test", - "data", - "scaling", - "10000rows.csv") - - ds = readstream(path) - - for df in ds - print(df) - end - -As the input makes clear, every `df` object generated by this for-loop -is a single row of the input data set. To work with larger minibatches, -we use the `nrows` keyword argument: - - ds = readstream(path, nrows = 1_000) - - for df in ds - print(df) - end - -**Note that the `df` objects generated during this for-loop are not -separate objects**: the memory used by `df` is rewritten during each -iteration of the loop to make it easier to work with large data sets. -If you need to get a separate object for each minibatch, you need to -call `copy(df)` on each `df` object generated during the loop. diff --git a/prototypes/indexing.jl b/prototypes/indexing.jl deleted file mode 100644 index 8c0ff4b3fd..0000000000 --- a/prototypes/indexing.jl +++ /dev/null @@ -1,344 +0,0 @@ -# This is an experiment in adding indexing to vectors. The idea is -# that if a DataFrame column is indexed, the following will have fast -# lookups: -# df[:(2 .< col1 .< 7), :] -# df[:( datecol .>= "2011-01-01" ), "col3"] -# df[:( col .== "red" ), :] -# -# Keeping indexing with columns has advantages: -# - Column ordering is less likely to be messed up inadvertantly. -# - Multiple columns can be indexed; there's no main key grouping. -# - More flexible logic combinations are possible. -# - You can mix and match keyed and non-keyed comparisons. -# - The DataFrame structure and indexing are not disturbed. -# - It's probably less coding and maintenance. -# The disadvantages are: -# - It's yet another vector type. 
-# - It's somewhat slower than having one overall set of DataFrame -# keys in a sorted DataFrame. It should still be pretty fast (no -# speed checks, yet). -# - You can't do data.table/pandas shortcuts like df["A"] for -# df[:( keycol .== "A" ), :]. (But, df["A"] is less descriptive if -# you don't know what df's keys are.) - - - -# An IndexedVector is a pointer to the original column along with an -# indexing vector equal to `order(orig)`. A comparison operation like -# `idv .> 3` returns an Indexer type. The Indexer type includes a -# pointer to the IndexedVector along with a vector of Range1's. -# DataVector's and DataFrame's can be indexed with Indexers. It's fast -# because you're using a slice of the already indexed vector. - -# Indexer's can be combined with `|` and `&`. In the case where the -# IndexedVector is the same, the Indexer is reduced or expanded as -# appropriate. This includes: `0 .< idv .< 3` or `idv .== 1 | idv .== -# 4`. If Indexers have different IndexedVectors (like `idv1 .== 1 | -# idv2 .== 1`), then the result is converted to a BitVector. - -# We handle NA's by excluding them from the index. `order` puts NA's -# at the front. We use the following function to exclude those. This -# does make the indexing a little trickier as the length of the index -# can be less than the length of the DataArray. - - -# Note that the following leaves out NA's. 
-indexorder(x) = sortperm(x) -function indexorder(v::AbstractDataVector) - Nna = sum(isna(v)) - sortperm(v)[Nna + 1 : end] -end - - -type IndexedVector{T,S<:AbstractVector} <: AbstractVector{T} - x::S - idx::Vector{Int} -end -IndexedVector{T}(x::AbstractVector{T}) = IndexedVector{T,typeof(x)}(x, indexorder(x)) - -Base.getindex{I<:Real}(v::IndexedVector,i::AbstractVector{I}) = IndexedVector(v.x[i]) -Base.getindex{I<:Real}(v::IndexedVector,i::I) = v.x[i] -Base.getindex(v::IndexedVector,i::Real) = v.x[i] -Base.setindex!(v::IndexedVector, val::Any, i::Real) = IndexedVector(setindex!(v.x, val, i)) -Base.setindex!(v::IndexedVector, val::Any, inds::AbstractVector) = IndexedVector(setindex!(v.x, val, inds)) -Base.reverse(v::IndexedVector) = IndexedVector(reverse(v.x), reverse(v.idx)) -Base.similar(v::IndexedVector, T, dims::Dims) = similar(v.x, T, dims) - -vecbind_type(x::IndexedVector) = vecbind_type(x.x) - -# to make assign in a DataFrame work: -upgrade_vector(v::IndexedVector) = v - - -Base.sortperm{S,A<:AbstractDataVector}(x::IndexedVector{S,A}) = [findin(x.x, [NA]), x.idx] -Base.sortperm(x::IndexedVector) = x.idx -Base.sortperm(x::IndexedVector, ::Base.Sort.ReverseOrdering) = reverse(x.idx) -Base.sort(x::IndexedVector) = x.x[sortperm(x)] -Base.sort(x::IndexedVector, ::Base.Sort.ReverseOrdering) = x.x[reverse(sortperm(x))] -Base.Perm{O<:Base.Sort.Ordering}(o::O, v::IndexedVector) = FastPerm(o, v) - -type Indexer - r::Vector{Range1} - iv::IndexedVector - function Indexer(r::Vector{Range1}, iv::IndexedVector) - for i in 1:length(r) - ri = r[i] - if length(ri) < 1 || ri[1] < 1 || ri[end] > length(iv) - delete!(r, i) - end - end - new(r, iv) - end -end - -function Base.intersect(v1::Vector{Range1}, v2::Vector{Range1}) - # Assumes that the Range1's in each vector are sorted and don't overlap. - # (Actually, it doesn't, but it should to get more speed.) - res = Range1[] - # This does more work than it needs to. 
- for r1 in v1 - for r2 in v2 - isoverlap = !((r1[1] > r2[end]) | (r2[1] > r1[end])) - if isoverlap - push!(res, max(r1[1], r2[1]):min(r1[end], r2[end])) - end - end - end - res -end - -function Base.union(v1::Vector{Range1}, v2::Vector{Range1}) - # Assumes that the Range1's in each vector are sorted and don't overlap. - # (Actually, it doesn't, but it should to get more speed.) - # TODO Check for zero length. - compare(v1, v2) = (v1[end][end] > v2[end][end] ? v2 : v1, - v1[end][end] > v2[end][end] ? v1 : v2) - res = Range1[] - v1 = copy(v1) # Destructively operate on these - v2 = copy(v2) - while length(v1) > 0 && length(v2) > 0 - # right part, working right to left - (left, right) = compare(v1, v2) - while length(right) > 0 && right[end][1] > left[end][end] - push!(res, pop!(right)) - end - if length(right) == 0 break; end - # overlap - r_end = max(right[end][end], left[end][end]) - overlap = false - while length(left) > 0 && length(right) > 0 && right[end][end] > left[end][1] - r_start = min(right[end][1], left[end][1]) - overlap = true - (left, right) = compare(v1, v2) - if right[end][1] >= r_start - pop!(right) - end - end - if overlap - if length(right) == 0 && length(left) > 0 && r_start <= left[end][1] - r_start = left[end][1] - pop!(left) - end - push!(res, r_start : r_end) - end - end - # Rest of v1 or v2 (no overlaps here) - while length(v1) > 0 - push!(res, pop!(v1)) - end - while length(v2) > 0 - push!(res, pop!(v2)) - end - reverse(res) -end - -function Base.(:!)(x::Indexer) # Negate the Indexer - res = Range1[1 : x.r[1][1] - 1] - for i in 1:length(x.r) - 1 - push!(res, x.r[i][end] + 1 : x.r[i + 1][1] - 1) - end - push!(res, x.r[end][end] + 1 : length(x.iv.idx)) - Indexer(res, x.iv) -end - -function Base.(:&)(x1::Indexer, x2::Indexer) - if is(x1.iv, x2.iv) - Indexer(intersect(x1.r, x2.r), x1.iv) - else - bool(x1) & bool(x2) - end -end -Base.(:&)(x1::BitVector, x2::Indexer) = x1 & bool(x2) -Base.(:&)(x1::Indexer, x2::BitVector) = x2 & bool(x1) - 
-function Base.(:|)(x1::Indexer, x2::Indexer) - if is(x1.iv, x2.iv) - Indexer(union(x1.r, x2.r), x1.iv) - else - bool(x1) | bool(x2) - end -end -Base.(:|)(x1::BitVector, x2::Indexer) = x1 | bool(x2) -Base.(:|)(x1::Indexer, x2::BitVector) = x2 | bool(x1) - -function Base.bool(ix::Indexer) - res = falses(length(ix.iv)) - for i in ix.iv.idx[[ix.r...]] - res[i] = true - end - res -end - -# `getindex` -- each Range1 of the Indexer (i.r...) is applied to the indexing vector (i.iv.idx) - -Base.getindex(x::IndexedVector, i::Indexer) = x[i.iv.idx[[i.r...]]] -Base.getindex(x::AbstractVector, i::Indexer) = x[i.iv.idx[[i.r...]]] -Base.getindex(x::AbstractDataVector, i::Indexer) = x[i.iv.idx[[i.r...]]] - -# df[MultiRowIndex, SingleColumnIndex] => (Sub)?AbstractDataVector -function Base.getindex(df::DataFrame, row_inds::Indexer, col_ind::ColumnIndex) - selected_column = df.colindex[col_ind] - return df.columns[selected_column][row_inds.iv.idx[[row_inds.r...]]] -end - -# df[MultiRowIndex, MultiColumnIndex] => (Sub)?DataFrame -function Base.getindex{T <: ColumnIndex}(df::DataFrame, row_inds::Indexer, col_inds::AbstractVector{T}) - selected_columns = df.colindex[col_inds] - new_columns = {dv[row_inds.iv.idx[[row_inds.r...]]] for dv in df.columns[selected_columns]} - return DataFrame(new_columns, Index(df.colindex.names[selected_columns])) -end - -typealias ComparisonTypes Union(Number, String) # defined mainly to avoid warnings - -# element-wise (in)equality operators -# these may need range checks -# Should these results be sorted? Could be a counting sort. 
-Base.(:.==){T<:ComparisonTypes}(a::IndexedVector{T}, v::T) = Indexer(Range1[search_sorted_first(a.x, v, a.idx) : search_sorted_last(a.x, v, a.idx)], a) -Base.(:.==){T<:ComparisonTypes}(v::T, a::IndexedVector{T}) = Indexer(Range1[search_sorted_first(a.x, v, a.idx) : search_sorted_last(a.x, v, a.idx)], a) -Base.(:.>=){T<:ComparisonTypes}(a::IndexedVector{T}, v::T) = Indexer(Range1[search_sorted_first(a.x, v, a.idx) : length(a.idx)], a) -Base.(:.<=){T<:ComparisonTypes}(a::IndexedVector{T}, v::T) = Indexer(Range1[1 : search_sorted_last(a.x, v, a.idx)], a) -Base.(:.>=){T<:ComparisonTypes}(v::T, a::IndexedVector{T}) = Indexer(Range1[1 : search_sorted_last(a.x, v, a.idx)], a) -Base.(:.<=){T<:ComparisonTypes}(v::T, a::IndexedVector{T}) = Indexer(Range1[search_sorted_first(a.x, v, a.idx) : length(a.idx)], a) -Base.(:.>){T<:ComparisonTypes}(a::IndexedVector{T}, v::T) = Indexer(Range1[search_sorted_first_gt(a.x, v, a.idx) : length(a.idx)], a) -Base.(:.<){T<:ComparisonTypes}(a::IndexedVector{T}, v::T) = Indexer(Range1[1 : search_sorted_last_lt(a.x, v, a.idx)], a) -Base.(:.<){T<:ComparisonTypes}(v::T, a::IndexedVector{T}) = Indexer(Range1[search_sorted_first_gt(a.x, v, a.idx) : length(a.idx)], a) -Base.(:.>){T<:ComparisonTypes}(v::T, a::IndexedVector{T}) = Indexer(Range1[1 : search_sorted_last_lt(a.x, v, a.idx)], a) - - -function search_sorted_first_gt{I<:Integer}(a::AbstractVector, x, idx::AbstractVector{I}) - res = search_sorted_last(a, x, idx) - if res == 0 return 1 end - if res == length(a) && a[idx[res]] != x return(length(a)+1) end - a[idx[res]] == x ? res + 1 : res -end -function search_sorted_last_lt{I<:Integer}(a::AbstractVector, x, idx::AbstractVector{I}) - res = search_sorted_first(a, x, idx) - if res > length(idx) return length(idx) end - if res == 1 && a[idx[res]] != x return(0) end - a[idx[res]] == x ? 
res - 1 : res -end - -Base.findin(a::IndexedVector,r::Range1) = findin(a, [r]) -Base.findin(a::IndexedVector,v::Real) = findin(a, [v]) -function Base.findin(a::IndexedVector, b::AbstractVector) - ## Returns an Indexer with the elements in "a" that appear in "b" - res = a .== b[1] - for i in 2:length(b) - res = res | (a .== b[i]) - end - res -end - -Base.size(a::IndexedVector) = size(a.x) -Base.length(a::IndexedVector) = length(a.x) -Base.ndims(a::IndexedVector) = 1 -Base.eltype(a::IndexedVector) = eltype(a.x) - -## print(io, a::IndexedVector) = print(io, a.x) -function Base.show(io::IO, a::IndexedVector) - print(io, "IndexedVector: ") - show(io, a.x) -end -function Base.repl_show(io::IO, a::IndexedVector) - print(io, "IndexedVector: ") - repl_show(io, a.x) -end - -function search_sorted_last{I<:Integer}(a::AbstractVector, x, idx::AbstractVector{I}) - ## Index of the last value of vector a that is less than or equal to x. - ## Returns 0 if x is less than all values of a. - ## idx is an indexing vector equal in length to a that sorts a - ## @assert length(a) == length(idx) - lo = 0 - hi = length(idx) + 1 - while lo < hi-1 - i = (lo+hi)>>>1 - if isless(x,a[idx[i]]) - hi = i - else - lo = i - end - end - lo -end - -function search_sorted_first{I<:Integer}(a::AbstractVector, x, idx::AbstractVector{I}) - ## Index of the first value of vector a that is greater than or equal to x. - ## Returns length(a) + 1 if x is greater than all values in a. - ## idx is an indexing vector equal in length to a that sorts a - ## @assert length(a) == length(idx) - lo = 0 - hi = length(idx) + 1 - while lo < hi-1 - i = (lo+hi)>>>1 - if isless(a[idx[i]],x) - lo = i - else - hi = i - end - end - hi -end - - - -# the following is needed for show(df) -maxShowLength(v::IndexedVector) = length(v) > 0 ? 
maximum([length(_string(x)) for x = v.x]) : 0 - -# Methods to speed up grouping and merging -function DataArrays.PooledDataArray{R}(d::IndexedVector, ::Type{R}) - refs = zeros(R, size(d)) - oneval = one(R) - local idx::Int - ## local lastval::T - local poolidx::R - pool = Array(eltype(d), 0) - # skip over NAs - nna = length(d) - length(d.x) - if nna == length(d) - return PooledDataArray(DataArrays.RefArray(refs), pool) - end - lastval = d.x[d.idx[nna+1]] - push!(pool, d.x[d.idx[nna+1]]) - poolidx = oneval - for i = 1 : length(d.idx) - idx = d.idx[i] - val = d.x[idx] - if val != lastval - push!(pool, val) - poolidx += oneval - lastval = val - end - refs[idx] = poolidx - end - return PooledDataArray(DataArrays.RefArray(refs), pool) -end -DataArrays.PooledDataArray(d::IndexedVector) = PooledDataArray(d, DEFAULT_POOLED_REF_TYPE) - -DataArrays.DataArray(d::IndexedVector) = DataArray(x.x) - -function DataArrays.PooledDataVecs{S}(v1::IndexedVector{S}, - v2::IndexedVector{S}) - return PooledDataVecs(PooledDataArray(v1), - PooledDataArray(v2)) -end diff --git a/prototypes/namedarray.jl b/prototypes/namedarray.jl deleted file mode 100644 index 11b0130aef..0000000000 --- a/prototypes/namedarray.jl +++ /dev/null @@ -1,62 +0,0 @@ - -# A NamedArray is like a list in R or a DataFrame in Julia without the -# requirement that columns be of equal length. 
The main reason for its -# existence is to allow creation of a DataFrame from unequal column -# lengths like the following: -# DataFrame(quote -# a = 1 -# b = [1:5] -# c = [1:10] -# end) -type NamedArray <: Associative{Any,Any} - data::Vector{Any} - idx::AbstractIndex - function NamedArray(data::Vector, idx::AbstractIndex) - if length(idx) != length(data) - error("index/names must be the same length as the data") - end - new(data, idx) - end -end -NamedArray() = NamedArray({}, Index()) - -Base.length(x::NamedArray) = length(x.idx) -Base.names(x::NamedArray) = names(x.idx) - -Base.getindex(x::NamedArray, c) = x[x.idx[c]] -Base.getindex(x::NamedArray, c::Integer) = x.data[c] -Base.getindex(x::NamedArray, c::Vector{Int}) = NamedArray(x.data[c], names(x)[c]) - -function Base.setindex!(x::NamedArray, newdata, ipos::Integer) - if ipos > 0 && ipos <= length(x) - x.data[ipos] = newdata - else - throw(ArgumentError("Can't replace a non-existent array position")) - end - x -end -function Base.setindex!(x::NamedArray, newdata, name) - ipos = get(x.idx.lookup, name, 0) - if ipos > 0 - # existing - setindex!(x, newdata, ipos) - else - # new - push!(x.idx, name) - push!(x.data, newdata) - end - x -end - - -# Associative methods: -Base.has(x::NamedArray, key) = has(x.idx, key) -Base.get(x::NamedArray, key, default) = has(x, key) ? 
x[key] : default -Base.keys(x::NamedArray) = keys(x.idx) -Base.values(x::NamedArray) = x.data -# Collection methods: -Base.start(x::NamedArray) = 1 -Base.done(x::NamedArray, i) = i > length(x.data) -Base.next(x::NamedArray, i) = ((x.idx.names[i], x[i]), i + 1) -Base.length(x::NamedArray) = length(x.data) -Base.isempty(x::NamedArray) = length(x.data) == 0 diff --git a/prototypes/test_dataframe_blocks.jl b/prototypes/test_dataframe_blocks.jl deleted file mode 100644 index eecfdf77af..0000000000 --- a/prototypes/test_dataframe_blocks.jl +++ /dev/null @@ -1,36 +0,0 @@ -module TestDDataFrame - using Base.Test - using DataArrays - using DataFrames - - const datafile = joinpath(dirname(@__FILE__), "data", "distributed", "test.csv") - const nloops = 10 - - function load_pkgs() - println("loading packages...") - @everywhere using Blocks - @everywhere using DataFrames - end - - if nprocs() < 4 - addwrkrs = 4 - nprocs() - println("adding $addwrkrs more processors...") - addprocs(addwrkrs) - end - println("\tnprocs: $(nprocs())") - load_pkgs() - - df = dreadtable(datafile, header=false) - names(df) - names!(df, ["c1","c2","c3","c4","c5","c6","c7","c8","c9","c10"]) - - sum_result = df+df - mul_result = 2*df - eq_result = (sum_result .== mul_result) - @assert all(eq_result) - - df1 = dreadtable(open(datafile), 1000) - @assert nrow(df1) == nrow(df) - @assert ncol(df1) == ncol(df) - @assert isapprox(sum(matrix(colsums(df1))), sum(matrix(colsums(df)))) -end diff --git a/prototypes/test_datastream.jl b/prototypes/test_datastream.jl deleted file mode 100644 index 33f4eecc11..0000000000 --- a/prototypes/test_datastream.jl +++ /dev/null @@ -1,118 +0,0 @@ -module TestDataStream - using Base.Test - using DataArrays - using DataFrames - - path = Pkg.dir("DataFrames", - "test", - "data", - "separators", - "sample_data.csv") - - ds = readstream(path, nrows = 1) - n = start(ds) - (df, n) = next(ds, n) - @assert done(ds, n) == false - (df, n) = next(ds, n) - @assert done(ds, n) == false - 
(df, n) = next(ds, n) - @assert done(ds, n) == true - - path = Pkg.dir("DataFrames", - "test", - "data", - "scaling", - "10000rows.csv") - - ds = readstream(path, nrows = 100) - n = start(ds) - (df, n) = next(ds, n) - @assert done(ds, n) == false - - ds = readstream(path, nrows = 5) - - for minibatch in ds - @assert isa(minibatch, DataFrame) - end - - # means = colmeans(ds) - # @assert abs(means[1, 4] - (-0.005686449)) < 10e-4 - # @assert abs(means[1, 5] - 19.01197) < 10e-4 - - # vars = colvars(ds) - # @assert abs(vars[1, 4] - 0.98048) < 10e-4 - # @assert abs(vars[1, 5] - 0.989416) < 10e-4 - - # (mins, maxs) = colranges(ds) - # @assert abs(mins[1, 4] - (-4.33635)) < 10e-4 - # @assert abs(mins[1, 5] - 15.6219) < 10e-4 - # @assert abs(maxs[1, 4] - 3.86857) < 10e-4 - # @assert abs(maxs[1, 5] - 22.574) < 10e-4 - - # covariances = cov(ds) - # for i in 1:ncol(covariances) - # if isna(covariances[i, i]) - # @assert isna(vars[1, i]) - # else - # @assert abs(covariances[i, i] - vars[1, i]) < 10e-4 - # end - # end - # @assert abs(covariances[4, 4] - 0.980479916) < 10e-4 - # @assert abs(covariances[4, 5] - 0.009823644) < 10e-4 - # @assert abs(covariances[5, 4] - 0.009823644) < 10e-4 - # @assert abs(covariances[5, 5] - 0.989415811) < 10e-4 - - # correlations = cor(ds) - # for i in ncol(correlations) - # for j in ncol(correlations) - # true_value = covariances[i, j] / sqrt(covariances[i, i] * covariances[j, j]) - # if isna(true_value) - # @assert isna(correlations[i, j]) - # else - # @assert abs(correlations[i, j] - true_value) < 10e-4 - # end - # end - # end - - # Deal with different delimiters - path = Pkg.dir("DataFrames", - "test", - "data", - "separators", - "sample_data.csv") - ds = readstream(path) - for row in ds - @assert size(row, 1) <= 1 - end - - path = Pkg.dir("DataFrames", - "test", - "data", - "separators", - "sample_data.tsv") - ds = readstream(path) - for row in ds - @assert size(row, 1) <= 1 - end - - path = Pkg.dir("DataFrames", - "test", - "data", - 
"separators", - "sample_data.wsv") - ds = readstream(path) - for row in ds - @assert size(row, 1) <= 1 - end - - # # DataFrame to DataStream conversion - # df = DataFrame(A = 1:25) - - # ds = DataStream(df, 5) - - # for mini in ds - # @assert nrow(mini) <= 5 - # end - - # colmeans(ds) -end diff --git a/prototypes/test_indexing.jl b/prototypes/test_indexing.jl deleted file mode 100644 index f28e005096..0000000000 --- a/prototypes/test_indexing.jl +++ /dev/null @@ -1,39 +0,0 @@ -module TestIndexedVector - using Base.Test - using DataArrays - using DataFrames - - # - # IndexedVector tests - # - - srand(1) - a = DataArray(rand(1:5,20)) - a[1:2] = NA - ia = IndexedVector(a) - b = DataArray(rand(5:8,20)) - ib = IndexedVector(b) - - - ia .== 4 - v = [1:20] - @assert v[ia .== 4] == v[a .== 4] - @assert sort(v[(ia .== 4) | (ia .== 5)]) == v[(a .== 4) | (a .== 5)] - @assert sort(v[(ia .>= 4) & (ia .== 5)]) == v[(a .>= 4) & (a .== 5)] - @assert sort(v[!(ia .== 4)]) == v[!(a .== 4)] - @assert sort(v[findin(ia, [3:6])]) == v[findin(a, [3:6])] - @assert sort(v[(ia .== 4) | (ib .== 6)]) == v[(a .== 4) | (b .== 6)] - - - df = DataFrame(quote - x1 = IndexedVector(vcat(fill([1:5],4)...)) - x2 = IndexedVector(vcat(fill(letters[1:10],2)...)) - end) - - df[:(x2 .== "a"), :] - df[:( (x2 .== "a") | (x1 .== 2) ), :] - df[:( ("b" .<= x2 .<= "c") | (x1 .== 5) ), :] - df[:( (x1 .== 1) & (x2 .== "a") ), :] - - df[findin(df["x2"], ["c","e","X"]), :] -end diff --git a/spec/FunctionReference.md b/spec/FunctionReference.md deleted file mode 100644 index 06b189cbd3..0000000000 --- a/spec/FunctionReference.md +++ /dev/null @@ -1,465 +0,0 @@ -# DataFrames.jl Package - -## DataFrames - -#### `DataFrame(cols::Vector, colnames::Vector{ByteString})` - -Construct a DataFrame from the columns given by `cols` with the index -generated by `colnames`. A DataFrame inherits from -`Associative{Any,Any}`, so Associative operations should work. Columns -are vector-like objects. 
Normally these are AbstractDataVector's (DataVector's -or PooledDataVector's), but they can also (currently) include standard -Julia Vectors. - -#### `DataFrame(cols::Vector)` - -Construct a DataFrame from the columns given by `cols` with default -column names. - -#### `DataFrame()` - -An empty DataFrame. - -#### `copy(df::DataFrame)` - -A shallow copy of `df`. Columns are referenced, not copied. - -#### `deepcopy(df::DataFrame)` - -A deep copy of `df`. Copies of each column are made. - -#### `similar(df::DataFrame, nrow)` - -A new DataFrame with `nrow` rows and the same column names and types as `df`. - - -### Basics - -#### `size(df)`, `ndims(df)` - -Same meanings as for Arrays. - -#### `has(df, key)`, `get(df, key, default)`, `keys(df)`, and `values(df)` - -Same meanings as Associative operations. `keys` are column names; -`values` are column contents. - -#### `start(df)`, `done(df,i)`, and `next(df,i)` - -Methods to iterate over columns. - -#### `ncol(df::AbstractDataFrame)` - -Number of columns in `df`. - -#### `nrow(df::AbstractDataFrame)` - -Number of rows in `df`. - -#### `length(df::AbstractDataFrame)` - -Number of columns in `df`. - -#### `isempty(df::AbstractDataFrame)` - -Whether the number of columns equals zero. - -#### `head(df::AbstractDataFrame)` and `head(df::AbstractDataFrame, i::Int)` - -First `i` rows of `df`. Defaults to 6. - -#### `tail(df::AbstractDataFrame)` and `tail(df::AbstractDataFrame, i::Int)` - -Last `i` rows of `df`. Defaults to 6. - -#### `show(io, df::AbstractDataFrame)` - -Standard pretty-printer of `df`. Called by `print()` and the REPL. - -#### `dump(df::AbstractDataFrame)` - -Show the structure of `df`. Like R's `str`. - -#### `describe(df::AbstractDataFrame)` - -Show a summary of each column of `df`. - -#### `complete_cases(df::AbstractDataFrame)` - -A Vector{Bool} of indexes of complete cases in `df` (rows with no -NA's). 
- -#### `duplicated(df::AbstractDataFrame)` - -A Vector{Bool} of indexes indicating rows that are duplicates of prior -rows. - -#### `unique(df::AbstractDataFrame)` - -DataFrame with unique rows in `df`. - - -### Indexing, Assignment, and Concatenation - -DataFrames are indexed like a Matrix and like an Associative. Columns -may be indexed by column name. Rows do not have names. Referencing -with one argument normally indexes by columns: `df["col"]`, -`df[["col1","col3"]]` or `df[i]`. With two arguments, rows and columns -are selected. Indexing along rows works like Matrix indexing. Indexing -along columns works like Matrix indexing with the addition of column -name access. - -#### `getindex(df::DataFrame, ind)` or `df[ind]` - -Returns a subset of the columns of `df` as specified by `ind`, which -may be an `Int`, a `Range`, a `Vector{Int}`, `ByteString`, or -`Vector{ByteString}`. Columns are referenced, not copied. For a -single-element `ind`, the column by itself is returned. - -#### `getindex(df::DataFrame, irow, icol)` or `df[irow,icol]` - -Returns a subset of `df` as specified by `irow` and `icol`. `irow` may -be an `Int`, a `Range`, or a `Vector{Int}`. `icol` may be an `Int`, a -`Range`, or a `Vector{Int}`, `ByteString`, or, `ByteString`, or -`Vector{ByteString}`. For a single-element `ind`, the column subset by -itself is returned. - -#### `index(df::DataFrame)` - -Returns the column `Index` for `df`. - -#### `set_group(df::DataFrame, newgroup, names::Vector{ByteString})` -#### `get_groups(df::DataFrame)` -#### `set_groups(df::DataFrame, gr::Dict)` - -See the Indexing section for these operations on column indexes. - -#### `colnames(df::DataFrame)` or `names(df::DataFrame)` - -The column names as an `Array{ByteString}` - -#### `setindex!(df::DataFrame, newcol, colname)` or `df[colname] = newcol` - -Replace or add a new column with name `colname` and contents `newcol`. -Arrays are converted to DataVector's. 
Values are recycled to match the -number of rows in `df`. - -#### `insert!(df::DataFrame, index::Integer, item, name)` - -Insert a column of name `name` and with contents `item` into `df` at -position `index`. - -#### `insert!(df::DataFrame, df2::DataFrame)` - -Insert columns of `df2` into `df1`. - -#### `del!(df::DataFrame, cols)` - -Delete columns in `df` at positions given by `cols` (noted with any -means that columns can be referenced). - -#### `del(df::DataFrame, cols)` - -Nondestructive version. Return a DataFrame based on the columns in -`df` after deleting columns specified by `cols`. - -#### `cbind(df1, df2, ...)` or `hcat(df1, df2, ...)` or `[df1 df2 ...]` - -Concatenate columns. Duplicated column names are adjusted. - -#### `rbind(df1, df2, ...)` or `vcat(df1, df2, ...)` or `[df1, df2, ...]` - -Concatenate rows. - -### I/O - -#### `csvDataFrame(filename, o::Options)` - -Return a DataFrame from file `filename`. Options `o` include -`colnames` [`"true"`, `"false"`, or `"check"` (the default)] and -`poolstrings` [`"check"` (default) or `"never"`]. - -### Expression/Function Evaluation in a DataFrame - -#### `with(df::AbstractDataFrame, ex::Expr)` - -Evaluate expression `ex` with the columns in `df`. - -#### `within(df::AbstractDataFrame, ex::Expr)` - -Return a copy of `df` after evaluating expression `ex` with the -columns in `df`. - -#### `within!(df::AbstractDataFrame, ex::Expr)` - -Modify `df` by evaluating expression `ex` with the columns in `df`. - -#### `based_on(df::AbstractDataFrame, ex::Expr)` - -Return a new DataFrame based on evaluating expression `ex` with the -columns in `df`. Often used for summarizing operations. - -#### `colwise(f::Function, df::AbstractDataFrame)` -#### `colwise(f::Vector{Function}, df::AbstractDataFrame)` - -Apply `f` to each column of `df`, and return the results as an -Array{Any}. 
- -#### `colwise(df::AbstractDataFrame, s::Symbol)` -#### `colwise(df::AbstractDataFrame, s::Vector{Symbol})` - -Apply the function specified by Symbol `s` to each column of `df`, and -return the results as a DataFrame. - -### SubDataFrames - -#### `sub(df::DataFrame, r, c)` -#### `sub(df::DataFrame, r)` - -Return a SubDataFrame with references to rows and columns of `df`. - - -#### `sub(sd::SubDataFrame, r, c)` -#### `sub(sd::SubDataFrame, r)` - -Return a SubDataFrame with references to rows and columns of `df`. - -#### `getindex(sd::SubDataFrame, r, c)` or `sd[r,c]` -#### `getindex(sd::SubDataFrame, c)` or `sd[c]` - -Referencing should work the same as DataFrames. - - -### Grouping - -#### `groupby(df::AbstractDataFrame, cols)` - -Return a GroupedDataFrame based on unique groupings indicated by the -columns with one or more names given in `cols`. - -#### `start(gd)`, `done(gd,i)`, and `next(gd,i)` - -Methods to iterate over GroupedDataFrame groupings. - -#### `getindex(gd::GroupedDataFrame, idx)` or `gd[idx]` - -Reference a particular grouping. Referencing returns a SubDataFrame. - -#### `with(gd::GroupedDataFrame, ex::Expr)` - -Evaluate expression `ex` with the columns in `gd` in each grouping. - -#### `within(gd::GroupedDataFrame, ex::Expr)` -#### `within!(gd::GroupedDataFrame, ex::Expr)` - -Return a DataFrame with the results of evaluating expression `ex` with -the columns in `gd` in each grouping. - -#### `based_on(gd::GroupedDataFrame, ex::Expr)` - -Sweeps along groups and applies `based_on` to each group. Returns a -DataFrame. - -#### `map(f::Function, gd::GroupedDataFrame)` - -Apply `f` to each grouping of `gd` and return the results in an Array. - -#### `colwise(f::Function, gd::GroupedDataFrame)` -#### `colwise(f::Vector{Function}, gd::GroupedDataFrame)` - -Apply `f` to each column in each grouping of `gd`, and return the -results as an Array{Any}. 
- -#### `colwise(gd::GroupedDataFrame, s::Symbol)` -#### `colwise(gd::GroupedDataFrame, s::Vector{Symbol})` - -Apply the function specified by Symbol `s` to each column of in each -grouping of `gd`, and return the results as a DataFrame. - -#### `by(df::AbstractDataFrame, cols, s::Symbol)` or `groupby(df, cols) | s` -#### `by(df::AbstractDataFrame, cols, s::Vector{Symbol})` - -Return a DataFrame with the results of grouping on `cols` and -`colwise` evaluation based on `s`. Equivalent to `colwise(groupby(df, -cols), s)`. - -#### `by(df::AbstractDataFrame, cols, e::Expr)` or `groupby(df, cols) | e` - -Return a DataFrame with the results of grouping on `cols` and -evaluation of `e` in each grouping. Equivalent to `based_on(groupby(df, -cols), e)`. - -### Reshaping / Merge - -#### `stack(df::DataFrame, cols)` - -For conversion from wide to long format. Returns a DataFrame with -stacked columns indicated by `cols`. The result has column `"key"` -with column names from `df` and column `"value"` with the values from -`df`. Columns in `df` not included in `cols` are duplicated along the -stack. - -#### `unstack(df::DataFrame, ikey, ivalue, irefkey)` - -For conversion from long to wide format. Returns a DataFrame. `ikey` -indicates the key column--unique values in column `ikey` will be -column names in the result. `ivalue` indicates the value column. -`irefkey` is the column with a unique identifier for that . Columns -not given by `ikey`, `ivalue`, or `irefkey` are currently ignored. - -#### `merge(df1::DataFrame, df2::DataFrame, bycol)` -#### `merge(df1::DataFrame, df2::DataFrame, bycol, jointype)` - -Return the database join of `df1` and `df2` based on the column `bycol`. -Currently only a single merge key is supported. Supports `jointype` of -"inner" (the default), "left", "right", or "outer". - - -## Index - -#### `Index()` -#### `Index(s::Vector{ByteString})` - -An Index with names `s`. An Index is like an Associative type. 
An -Index is used for column indexing of DataFrames. An Index maps -ByteStrings and Vector{ByteStrings} to Indices. - -#### `length(x::Index)`, `copy(x::Index)`, `has(x::Index, key)`, `keys(x::Index)`, `push!(x::Index, name)` - -Normal meanings. - -#### `del(x::Index, idx::Integer)`, `del(x::Index, s::ByteString)`, - -Delete the name `s` or name at position `idx` in `x`. - -#### `names(x::Index)` - -A Vector{ByteString} with the names of `x`. - -#### `names!(x::Index, nm::Vector{ByteString})` - -Set names `nm` in `x`. - -#### `rename(x::Index, f::Function)` -#### `rename(x::Index, nd::Associative)` -#### `rename(x::Index, from::Vector, to::Vector)` - -Replace names in `x`, by applying function `f` to each name, -by mapping old to new names with a dictionary (Associative), or using -`from` and `to` vectors. - -#### `getindex(x::Index, idx)` or `x[idx]` - -This does the mapping from name(s) to Indices (positions). `idx` may -be ByteString, Vector{ByteString}, Int, Vector{Int}, Range{Int}, -Vector{Bool}, AbstractDataVector{Bool}, or AbstractDataVector{Int}. - -#### `set_group(idx::Index, newgroup, names::Vector{ByteString})` - -Add a group to `idx` with name `newgroup` that includes the names in -the vector `names`. - -#### `get_groups(idx::Index)` - -A Dict that maps the name of each group to the names in the group. - -#### `set_groups(idx::Index, gr::Dict)` - -Set groups in `idx` based on the mapping given by `gr`. - - -## Missing Values - -Missing value behavior is implemented by instantiations of the `AbstractDataVector` -abstract type. - -#### `NA` - -A constant indicating a missing value. - -#### `isna(x)` - -Return a `Bool` or `Array{Bool}` (if `x` is an `AbstractDataVector`) -that is `true` for elements with missing values. - -#### `nafilter(x)` - -Return a copy of `x` after removing missing values. - -#### `nareplace(x, val)` - -Return a copy of `x` after replacing missing values with `val`. 
- -#### `naFilter(x)` - -Return an object based on `x` such that future operations like `mean` -will not include missing values. This can be an iterator or other -object. - -#### `naReplace(x, val)` - -Return an object based on `x` such that future operations like `mean` -will replace NAs with `val`. - -#### `na(x)` - -Return an `NA` value appropriate for the type of `x`. - -#### `nas(x, dim)` - -Return an object like `x` filled with `NA`'s with size `dim`. - - -## DataVector's - -#### `DataArray(x::Vector)` -#### `DataArray(x::Vector, m::Vector{Bool})` - -Create a DataVector from `x`, with `m` optionally indicating which values -are NA. DataVector's are like Julia Vectors with support for NA's. `x` may -be any type of Vector. - -#### `PooledDataArray(x::Vector)` -#### `PooledDataArray(x::Vector, m::Vector{Bool})` - -Create a PooledDataVector from `x`, with `m` optionally indicating which -values are NA. PooledDataVector's contain a pool of values with references -to those values. This is useful in a similar manner to an R array of -factors. - -#### `size`, `length`, `ndims`, `ref`, `assign`, `start`, `next`, `done` - -All normal Vector operations including array referencing should work. - -#### `isna(x)`, `nafilter(x)`, `nareplace(x, val)`, `naFilter(x)`, `naReplace(x, val)` - -All NA-related methods are supported. - -## Utilities - -#### `cut(x::Vector, breaks::Vector)` - -Returns a PooledDataVector with length equal to `x` that divides values in `x` -based on the divisions given by `breaks`. - -## Formulas and Models - -#### `Formula(ex::Expr)` - -Return a Formula object based on `ex`. Formulas are two-sided -expressions separated by `~`, like `:(y ~ w*x + z + i&v)`. - -#### `model_frame(f::Formula, d::AbstractDataFrame)` -#### `model_frame(ex::Expr, d::AbstractDataFrame)` - -A ModelFrame. 
- -#### `model_matrix(mf::ModelFrame)` -#### `model_matrix(f::Formula, d::AbstractDataFrame)` -#### `model_matrix(ex::Expr, d::AbstractDataFrame)` - -A ModelMatrix based on `mf`, `f` and `d`, or `ex` and `d`. - -#### `lm(ex::Expr, df::AbstractDataFrame)` - -Linear model results (type OLSResults) based on formula `ex` and `df`. diff --git a/spec/JuliaChanges.md b/spec/JuliaChanges.md deleted file mode 100644 index 6c99c74211..0000000000 --- a/spec/JuliaChanges.md +++ /dev/null @@ -1,43 +0,0 @@ -## Possible changes to Julia syntax - -DataFrames fit well with Julia's syntax, but some features would -improve the user experience. - -### Keyword function arguments - -[Issue 485](https://github.com/JuliaLang/julia/issues/485) - -With many functions, it would be nice to have options. options.jl is -nice, but it is still clumsy from the user's point of view. - -DataFrame creation would be cleaner: - -```julia -d = DataFrame(a = [1:20], - b = PooledDataArray([1:20])) -``` - -In addition, a number of existing and planned functions are calling -out for optional arguments. - -### ~ for easier expression syntax - -It'd be nice to be able to do: - -```julia - by(df[~ a > 3], ["b", "c"], ~ x_sum = sum(x); y_mean = mean(y)) -``` -A two-sided version would allow better formulas: - -```julia - lm(a ~ b) -``` - -~ is currently used as bitwise not, but it looks like it's not used -much, and this could be replaced by ! or by a function. - - -### Overloading . - -df.col1 is nicer than df["col1"] for column access. - diff --git a/spec/basics.md b/spec/basics.md deleted file mode 100644 index 161655a6a6..0000000000 --- a/spec/basics.md +++ /dev/null @@ -1,10 +0,0 @@ -# AbstractDataFrame - -An AbstractDataFrame is an in-memory database that consists of named columns, each of which can contain missing data. Every column has a well-defined type, but different columns can have different types. 
An AbstractDataFrame can be accessed using numeric indexing for both rows and columns and name-based -indexing for columns. - -Current subtypes of AbstractDataFrame include DataFrame and SubDataFrame. - -# DataFrame - -A DataFrame is a vector of heterogeneous AbstractDataVector's that be accessed using numeric indexing for both rows and columns and name-based indexing for columns. The columns are stored in a vector, which means that operations that insert/delete columns are O(n). diff --git a/spec/show.md b/spec/show.md deleted file mode 100644 index bc0a08710c..0000000000 --- a/spec/show.md +++ /dev/null @@ -1,178 +0,0 @@ -# Printing DataFrames - -AbstractDataFrames are rendered to the REPL window using a best-effort strategy that attempts to ensure that the output is always readable within the horizontal and vertical boundaries of the REPL window. - -Admittedly, this does not always produce a visually appealing output. In many ways, the static window offered by a standard 80 character by 24 character REPL is a bad medium in which to render DataFrames: there is not enough vertical space to show all rows and there is also not enough horizontal space to show all columns. - -Unlike Arrays, whose homogeneity of type makes horizontal truncation a reasonable default, a lot of important information about the schema for an AbstractDataFrame is lost if a large number of the AbstractDataFrame's columns are not displayed in the REPL. - -# Printing Strategy - -The DataFrames.jl package therefore employs the following best-effort strategy for rendering an AbstractDataFrame to the REPL window. - -* Determine how much horizontal width would be required to render every column of the AbstractDataFrame, including the implicit "Row #" column contained in every AbstractDataFrame. -* If the horizontal width fits within the visible REPL window, render the AbstractDataFrame as a table that shows the head and tail in complete detail. 
Under this default behavior, the number of rows shown is guaranteed to fit in the vertical height of the REPL window. -* If the horizontal width required for complete output exceeds the amount of space available in the REPL window, we do not attempt to print out all of the columns. Instead, we print out a summary of the DataFrame's schema in three parts: (1) the name of each column, (2) the type of each column and (3) the number of missing entries in each column. This summary is **not** guaranteed to fit in the vertical height of the REPL window. -* If the full set of columns of the AbstractDataFrame would not fit in the horizontal width of the REPL window, the user may request that the text representation of the AbstractDataFrame will be paginated into chunks that exhausitively display every column. Each of these chunks is guaranteed to fit within the horizontal width of the REPL. -* If the full set of rows of the AbstractDataFrame would not fit in the vertical height of the REPL window, the user may request that all rows be shown in the REPL using the `showall` function. The output of this is **not** guaranteed to fit in the horizontal width of the REPL window. As with the shortened summary described earlier, the user may additionally request that this output be paginated into chunks that exhausitively display every column. Each of these chunks is guaranteed to fit within the horizontal width of the REPL. - -# Functions - -For the end-user, this display strategy leads to four possible function calls: - -* `show(adf)`: If the columns fit in the window, show the head and tail of the DataFrame with all columns. If the columns do not fit in the window, show a summary of the table in terms of its schema instead. *This function is the default used by the REPL.* -* `show(adf, true)`: Show the head and tail of the DataFrame with all columns, no matter what. 
If necessary, the output will be paginated so that each chunk of output fits within the horizontal width of the window. -* `showall(adf)`: Show all of the DataFrame's contents, including all rows and columns. The size of the REPL window is ignored. -* `showall(adf, true)`: Show all of the DataFrame's rows, but paginate the output so that each chunk fits in the horizontal width of the REPL window. - -In addition to all the properties described above, the output is always formatted as a valid MultiMarkdown table that can be used anywhere that supports complex Markdown. This makes it especially easy to use DataFrames to organize data for reporting on GitHub. - -# Usage Examples of Expected Output - -```julia -julia> using DataFrames - -julia> df = DataFrame(A = [repeat("a", 40) for i in 1:24], - B = [repeat("b", 40) for i in 1:24]) -# 2x3 DataFrame -# | Col # | Name | Type | Missing | -# |-------|------|-------------|---------| -# | 1 | A | ASCIIString | 0 | -# | 2 | B | ASCIIString | 0 | - -julia> show(df) -# 2x3 DataFrame -# | Col # | Name | Type | Missing | -# |-------|------|-------------|---------| -# | 1 | A | ASCIIString | 0 | -# | 2 | B | ASCIIString | 0 | - -julia> show(df, true) -# 24x2 DataFrame -# | Row # | A | -# |-------|------------------------------------------| -# | 1 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 2 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 3 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 4 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 5 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 6 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 7 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 8 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# â‹® -# | 16 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 17 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 18 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 19 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 20 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 21 | 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 22 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 23 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 24 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# -# | Row # | B | -# |-------|------------------------------------------| -# | 1 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 2 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 3 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 4 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 5 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 6 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 7 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 8 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# â‹® -# | 16 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 17 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 18 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 19 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 20 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 21 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 22 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 23 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 24 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | - -julia> showall(df) -# 24x2 DataFrame -# | Row # | A | B | -# |-------|------------------------------------------|------------------------------------------| -# | 1 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 2 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 3 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 4 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 5 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 6 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 7 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | 
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 8 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 9 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 10 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 11 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 12 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 13 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 14 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 15 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 16 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 17 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 18 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 19 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 20 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 21 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 22 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 23 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 24 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | - -julia> showall(df, true) -# 24x2 DataFrame -# | Row # | A | -# |-------|------------------------------------------| -# | 1 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 2 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 3 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 4 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | 
-# | 5 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 6 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 7 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 8 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 9 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 10 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 11 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 12 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 13 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 14 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 15 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 16 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 17 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 18 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 19 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 20 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 21 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 22 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 23 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# | 24 | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa | -# -# | Row # | B | -# |-------|------------------------------------------| -# | 1 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 2 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 3 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 4 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 5 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 6 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 7 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 8 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 9 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 10 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 11 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 12 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 13 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 14 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 15 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 16 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 17 | 
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 18 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 19 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 20 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 21 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 22 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 23 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -# | 24 | bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb | -``` diff --git a/sphinxdoc/other/design_details.rst b/sphinxdoc/other/design_details.rst deleted file mode 100644 index 426f5efde4..0000000000 --- a/sphinxdoc/other/design_details.rst +++ /dev/null @@ -1,781 +0,0 @@ -************************ -The Design of DataFrames -************************ - -The Type Hierarchy ------------------- - -Before we do anything else, let's go through the hierarchy of types introduced by the DataFrames package. This type hierarchy is depicted visually in the figures at the end of this section and can be summarized in a simple nested list: - -* NAtype -* AbstractDataVector - - * DataVector - * PooledDataVector - -* AbstractMatrix - - * DataMatrix - -* AbstractDataArray - - * DataArray - -* AbstractDataFrame - - * DataFrame - -* AbstractDataStream - - * FileDataStream - * DataFrameDataStream - * MatrixDataStream - -We'll step through each element of this hierarchy in turn in the following sections. - -Overview of Basic Types for Working with Data ---------------------------------------------- - -There are four new types introduced by the current generation of the DataFrames package: - -* NAType: A scalar value that represents a single missing piece of data. This value behaves much like ``NA`` in R. -* DataVector: A vector that can contain values of a specific type as well as ``NA`` values. -* PooledDataVector: An alternative to DataVector's that can be more memory-efficient if a small number of distinc values are present in the underlying vector of data. 
-* DataFrame: A tabular data structure that is similar to R's ``data.frame`` and Pandas' ``DataFrame``. - -In the future, we will also be introducing generic Arrays of arbitrary dimension. After this, we will provide two new types: - -* DataMatrix: A matrix that can contain values of a specific type as well as ``NA`` values. -* DataArray: An array that can contain values of a specific type as well as ``NA`` values. - -The ``NA`` Type -=============== - -The core problem with using the data structures built into Julia for data analysis is that there is no mechanism for expressing the absence of data. Traditional database systems express the absence of data using a ``NULL`` value, while data analysis packages typically follow the tradition set by S and use ``NA`` for this purpose when referring to data. (NB: *In S and R, ``NULL`` is present in addition to ``NA``, but it refers to the absence of any specific value for a variable in code, rather than the absence of any specific value for something inside of a data set.*) - -The DataFrames package expresses the absence of data by introducing a new type called ``NAtype``. This value is used everywhere to indicate missingness in the underlying data set. - -To see this value, you can type:: - - NAtype - -in the Julia REPL.
You can learn more about the nature of this new type using standard Julia functions for navigating Julia's type system:: - - typeof(NAtype) - - super(NAtype) - - dump(NAtype) - -While the ``NAtype`` provides the essential type needed to express missingness, the practical way that missing data is denoted uses a special constant ``NA``, which is an instance of ``NAtype``:: - - NA - NAtype() - -You can explore this value to confirm that ``NA`` is just an instance of the ``NAtype``:: - - typeof(NA) - - dump(NA) - -Simply being able to express the notion that a data point is missing is important, but we're ultimately not interested in just expressing data: we want to build tools for interacting with data that may be missing. In a later section, we'll describe the details of interacting with ``NA``, but for now we'll state the defining property of ``NA``: *because ``NA`` expresses ignorance about the value of something, every interaction with ``NA`` corrupts known values and transforms them into ``NA`` values*. Below we show how this works for addition:: - - 1 + NA - -We'll discuss the subtleties of ``NA`` values ability to corrupt known values in a later section. For now the essential point is this: ``NA`` values exist to represent missingness that occurs in scalar data. - -The DataVector Type -=================== - -To express the notion that a complex data structure like an ``Array`` contains missing entries, we need to construct a new data structure that can contain standard Julia values like ``Float64`` while also allowing the presence of ``NA`` values. - -Of course, a Julian ``Array{Any}`` would allow us to do this:: - - {1, NA} - -But consistently using ``Any`` arrays would make Julia much less efficient. Instead, we want to provide a new data structure that parallels a standard Julia ``Array``, while allowing exactly one additional value: ``NA``. - -This new data structure is the ``DataVector`` type. 
You can construct your first ``DataVector`` using the following code:: - - DataVector[1, NA, 3] - -As you'll see when entering this into the REPL, this snippet of code creates a ``3-element DataVector{Int}``. A ``DataVector`` of type ``DataVector{Int}`` can store ``Int`` values or ``NA`` values. In general, a ``DataVector`` of type ``DataVector{T}`` can store values of type ``T`` or ``NA`` values. - -This is achieved by a very simple mechanism: a ``DataVector{T}`` is a new parametric composite type that we've added to Julia that wraps around a standard Julia ``Vector`` and complements this basic vector with a metadata store that indicates whether any entry of the wrapped vector is missing. In essence, a ``DataVector`` of type ``T`` is defined as:: - - type DataVector{T} - data::Vector{T} - na::BitVector - end - -This allows us to assess whether any entry of the vector is ``NA`` at the cost of exactly one additional bit per item. We are able to save space by using ``BitArray`` instead of an ``Array{Bool}``. At present, we store the non-missing data values in a vector called ``data`` and we store the metadata that indicates which values are missing in a vector called ``na``. But end-users should not worry about these implementation details. - -Instead, you can simply focus on the behavior of the ``DataVector`` type. Let's start off by exploring the basic properties of this new type:: - - DataVector - - typeof(DataVector) - typeof(DataVector{Int}) - - super(DataVector) - super(super(DataVector)) - - DataVector.names - -If you want to drill down further, you can always run ``dump()``:: - - dump(DataVector) - -We're quite proud that the definition of ``DataVector`` is so simple: it makes it easier for end-users to start contributing code to the DataFrames package. - -Constructing DataVector's -========================= - -Let's focus on ways that you can create new ``DataVector``. 
The simplest possible constructor requires the end-user to directly specify both the underlying data values and the missingness metadata as a ``BitVector``:: - - dv = DataArray([1, 2, 3], falses(3)) - -This is rather ugly, so we've defined many additional constructors that make it easier to create a new ``DataVector``. The first simplification is to ignore the distinction between a ``BitVector`` and an ``Array{Bool, 1}`` by allowing users to specify ``Bool`` values directly:: - - dv = DataArray([1, 2, 3], [false, false, false]) - -In practice, this is still a lot of useless typing when all of the values of the new ``DataVector`` are not missing. In that case, you can just pass a Julian vector:: - - dv = DataArray([1, 2, 3]) - -When the values you wish to store in a ``DataVector`` are sequential, you can cut down even further on typing by using a Julian ``Range``:: - - dv = DataArray(1:3) - -In contrast to these normal-looking constructors, when some of the values in the new ``DataVector`` are missing, there is a very special type of constructor you can use:: - - dv = DataVector[1, 2, NA, 4] - -*Technical Note*: This special type of constructor is defined by overloading the ``getindex()`` function to apply to values of type ``DataVector``. - -DataVector's with Special Types -=============================== - -One of the virtues of using metadata to represent missingness instead of sentinel values like ``NaN`` is that we can easily define ``DataVector`` over arbitrary types. 
For example, we can create ``DataVector`` that store arbitrary Julia types like ``ComplexPair`` and ``Bool``:: - - dv = DataArray([1 + 2im, 3 - 1im]) - - dv = DataArray([true, false]) - -In fact, we can add a new type of our own and then wrap it inside of a new sort of ``DataVector``:: - - type MyNewType - a::Int - b::Int - c::Int - end - - dv = DataArray([MyNewType(1, 2, 3), MyNewType(2, 3, 4)]) - -Of course, specializing the types of ``DataVector`` means that we sometimes need to convert between types. Just as Julia has several specialized conversion functions for doing this, the DataFrames package provides conversion functions as well. For now, we have three such functions: - -* ``dataint()`` -* ``datafloat()`` -* ``databool()`` - -Using these, we can naturally convert between types:: - - dv = DataArray([1.0, 2.0]) - - dataint(dv) - -In the opposite direction, we sometimes want to create arbitrary length ``DataVector`` that have a specific type before we insert values:: - - dv = DataArray(Float64, 5) - - dv[1] = 1 - -``DataArray`` created in this way have ``NA`` in all entries. If you instead wish to initialize a ``DataArray`` with standard initial values, you can use one of several functions: - -* ``datazeros()`` -* ``dataones()`` -* ``datafalses()`` -* ``datatrues()`` - -Like the similar functions in Julia's Base, we can specify the length and type of these initialized vectors:: - - dv = datazeros(5) - dv = datazeros(Int, 5) - - dv = dataones(5) - dv = dataones(Int, 5) - - dv = datafalses(5) - - dv = datatrues(5) - -The PooledDataArray Type -======================== - -On the surface, ``PooledDataArray``s look like ``DataArray``s, but their implementation allows the efficient storage and manipulation of ``DataVector``s and ``DataArrays`` which only contain a small number of values. Internally, ``PooledDataArray``s hold a pool of unique values, and the actual ``DataArray`` simply indexes into this pool, rather than storing each value individually. 
- -A ``PooledDataArray`` can be constructed from an ``Array`` or ``DataArray``, and as with regular ``DataArray``s, it can hold ``NA`` values:: - - pda = PooledDataArray([1, 1, 1, 1, 2, 3, 2, 2, 3, 3, 3]) - pda2 = PooledDataArray(DataArray["red", "green", "yellow", "yellow", "red", "orange", "red", "green"]) - -``PooledDataArray``s can also be created empty or with a fixed size and a specific type:: - - pda3 = PooledDataArray(String, 2000) # A pooled data array of 2000 strings, initially filled with NAs - pda4 = PooledDataArray(Float64) # An empty pooled data array of floats - -By default, the index into the pool of values is a Uint32, allowing 2^32 possible pool values. If you know that you will only have a much smaller number of unique values, you can specify a smaller reference index type, to save space:: - - pda5 = PooledDataArray(String, Uint8, 5000, 2) # Create a 5000x2 array of String values, - # initialized to NA, - # with at most 2^8=256 unique values - -``PooledDataVector``s can be used as columns in DataFrames. - - -The DataFrame Type -================== - -While ``DataVector`` are a very powerful tool for dealing with missing data, they only bring us part of the way towards representing real-world data in Julia. The final missing data structure is a tabular data structure of the sort used in relational databases and spreadsheet software. - -To represent these kinds of tabular data sets, the DataFrames package provides the ``DataFrame`` type. The ``DataFrame`` type is a new Julian composite type with just two fields: - -* ``columns``: A Julia ``Vector{Any}``, each element of which will be a single column of the tabular data. The typical column is of type ``DataVector{T}``, but this is not strictly required. -* ``colindex``: An ``Index`` object that allows one to access entries in the columns using both numeric indexing (like a standard Julian ``Array``) or key-valued indexing (like a standard Julian ``Dict``).
The details of the ``Index`` type will be described later; for now, we just note that an ``Index`` can easily be constructed from any array of ``ByteString``. This array is assumed to specify the names of the columns. For example, you might create an index as follows: ``Index(["ColumnA", "ColumnB"])``. - -In the future, we hope that there will be many different types of ``DataFrame``-like constructs. But all objects that behave like a ``DataFrame`` will behave according to the following rules that are enforced by an ``AbstractDataFrame`` protocol: - -* A DataFrame-like object is a table with ``M`` rows and ``N`` columns. -* Every column of a DataFrame-like object has its own type. This heterogeneity of types is the reason that a DataFrame cannot simply be represented using a matrix of ``DataVector``. -* Each columns of a DataFrame-like object is guaranteed to have length ``M``. -* Each columns of a DataFrame-like object is guaranteed to be capable of storing an ``NA`` value if one is ever inserted. NB: *There is ongoing debate about whether the columns of a DataFrame should always be ``DataVector`` or whether the columns should only be converted to ``DataVector`` if an ``NA`` is introduced by an assignment operation.* - -Constructing DataFrame's -======================== - -Now that you understand what a ``DataFrame`` is, let's build one:: - - df_columns = {datazeros(5), datafalses(5)} - df_colindex = Index(["A", "B"]) - - df = DataFrame(df_columns, df_colindex) - -In practice, many other constructors are more convenient to use than this basic one. The simplest convenience constructors is to provide only the columns, which will produce default names for all the columns:: - - df = DataFrame(df_columns) - -One often would like to construct ``DataFrame`` from columns which may not yet be ``DataVector``. This is possible using the same type of constructor. 
All columns that are not yet ``DataVector`` will be converted to ``DataVector``:: - - df = DataFrame({ones(5), falses(5)}) - -Often one wishes to convert an existing matrix into a ``DataFrame``. This is also possible:: - - df = DataFrame(ones(5, 3)) - -Like ``DataVector``, it is possible to create empty ``DataFrame`` in which all of the default values are ``NA``. In the simplest version, we specify a type, the number of rows and the number of columns:: - - df = DataFrame(Int, 10, 5) - -Alternatively, one can specify a ``Vector`` of types. This implicitly defines the number of columns, but one must still explicitly specify the number of rows:: - - df = DataFrame({Int, Float64}, 4) - -When you know what the names of the columns will be, but not the values, it is possible to specify the column names at the time of construction. - -*SHOULD THIS BE ``DataFrame(types, nrow, names)`` INSTEAD?*:: - - DataFrame({Int, Float64}, ["A", "B"], 10) - DataFrame({Int, Float64}, Index(["A", "B"]), 10) # STILL NEED TO MAKE THIS WORK - -A more uniquely Julian way of creating ``DataFrame`` exploits Julia's ability to quote ``Expression`` in order to produce behavior like R's delayed evaluation strategy:: - - df = DataFrame(quote - A = rand(5) - B = datatrues(5) - end) - -Accessing and Assigning Elements of DataVector's and DataFrame's -================================================================ - -Because a ``DataVector`` is a 1-dimensional Array, indexing into it is trivial and behaves exactly like indexing into a standard Julia vector. :: - - dv = dataones(5) - dv[1] - dv[5] - dv[end] - dv[1:3] - dv[[true, true, false, false, false]] - - dv[1] = 3 - dv[5] = 5.3 - dv[end] = 2.1 - dv[1:3] = [3.2, 3.2, 3.1] - dv[[true, true, false, false, false]] = dataones(2) # SHOULD WE MAKE THIS WORK? - - -In contrast, a DataFrame is a random-access data structure that can be indexed into and assigned to in many different ways. We walk through many of them below. 
- -Simple Numeric Indexing ------------------------ - -Index by numbers:: - - df = DataFrame(Int, 5, 3) - df[1, 3] - df[1] - - -Range-Based Numeric Indexing ----------------------------- - -Index by ranges:: - - df = DataFrame(Int, 5, 3) - - df[1, :] - df[:, 3] - df[1:2, 3] - df[1, 1:3] - df[:, :] - -Column Name Indexing --------------------- - -Index by column names:: - - df["x1"] - df[1, "x1"] - df[1:3, "x1"] - - df[["x1", "x2"]] - df[1, ["x1", "x2"]] - df[1:3, ["x1", "x2"]] - -Unary Operators for NA, DataVector's and DataFrame's -==================================================== - -In practice, we want to compute with these new types. The first requirement is to define the basic unary operators: - -* ``+`` -* ``-`` -* ``!`` -* *MISSING: The transpose unary operator* - -You can see these operators in action below:: - - +NA - -NA - !NA - - +dataones(5) - -dataones(5) - !datafalses(5) - -Binary Operators ----------------- - -* Arithmetic Operators: - - * Scalar Arithmetic: ``+``, ``-``, ``*``, ``/``, ``^``, - * Array Arithmetic: ``+``, ``.+``, ``-``, ``.-``, ``.*``, ``./``, ``.^`` - -* Bit Operators: ``&``, ``|``, ``$`` -* Comparison Operators: - - * Scalar Comparisons: ``==``, ``!=``, ``<``, ``<=``, ``>``, ``>=`` - * Array Comparisons: ``.==``, ``.!=``, ``.<``, ``.<=``, ``.>``, ``.>=`` - -The standard arithmetic operators work on DataVector's when they interact with Number's, NA's or other DataVector's. :: - - dv = dataones(5) - dv[1] = NA - df = DataFrame(quote - a = 1:5 - end) - -NA's with NA's --------------- - -:: - - NA + NA - NA .+ NA - -And so on for ``-``, ``.-``, ``*``, ``.*``, ``/``, ``./``, ``^``, ``.^``. - -NA's with Scalars and Scalars with NA's ---------------------------------------- - -:: - - 1 + NA - 1 .+ NA - NA + 1 - NA .+ 1 - -And so on for ``-``, ``.-``, ``*``, ``.*``, ``/``, ``./``, ``^``, ``.^``. 
- -NA's with DataVector's ----------------------- - -:: - - dv + NA - dv .+ NA - NA + dv - NA .+ dv - -And so on for ``-``, ``.-``, ``*``, ``.*``, ``/``, ``./``, ``^``, ``.^``. - -DataVector's with Scalars -------------------------- - -:: - - dv + 1 - dv .+ 1 - -And so on for ``-``, ``.-``, ``.*``, ``./``, ``.^``. - -Scalars with DataVector's -------------------------- - -:: - - 1 + dv - 1 .+ dv - -And so on for ``-``, ``.-``, ``*``, ``.*``, ``/``, ``./``, ``^``, ``.^``. - -*HOW MUCH SHOULD WE HAVE OPERATIONS W/ DATAFRAMES?* - -:: - - NA + df - df + NA - 1 + df - df + 1 - dv + df # SHOULD THIS EXIST? - df + dv # SHOULD THIS EXIST? - df + df - -And so on for ``-``, ``.-``, ``.*``, ``./``, ``.^``. - -The standard bit operators work on ``DataVector``: - -*TO BE FILLED IN* - -The standard comparison operators work on ``DataVector``:: - - NA .< NA - NA .< "a" - NA .< 1 - NA .== dv - - dv .< NA - dv .< "a" - dv .< 1 - dv .== dv - - df .< NA - df .< "a" - df .< 1 - df .== dv # SHOULD THIS EXIST? 
- df .== df - -Elementwise Functions ---------------------- - -* ``abs`` -* ``sign`` -* ``acos`` -* ``acosh`` -* ``asin`` -* ``asinh`` -* ``atan`` -* ``atan2`` -* ``atanh`` -* ``sin`` -* ``sinh`` -* ``cos`` -* ``cosh`` -* ``tan`` -* ``tanh`` -* ``ceil`` -* ``floor`` -* ``round`` -* ``trunc`` -* ``signif`` -* ``exp`` -* ``log`` -* ``log10`` -* ``log1p`` -* ``log2`` -* ``exponent`` -* ``sqrt`` - -Standard functions that apply to scalar values of type ``Number`` return ``NA`` when applied to ``NA``:: - - abs(NA) - -Standard functions are broadcast to the elements of ``DataVector`` and ``DataFrame`` for elementwise application:: - - dv = dataones(5) - df = DataFrame({dv}) - - abs(dv) - abs(df) - -Pairwise Functions ------------------- - -* ``diff`` - -Functions that operate on pairs of entries of a ``Vector`` work on ``DataVector`` and insert ``NA`` where it would be produced by other operator rules:: - - diff(dv) - -Cumulative Functions --------------------- - -* ``cumprod`` -* ``cumsum`` -* ``cumsum_kbn`` -* MISSING: ``cummin`` -* MISSING: ``cummax`` - -Functions that operate cumulatively on the entries of a ``Vector`` work on ``DataVector`` and insert ``NA`` where it would be produced by other operator rules:: - - cumprod(dv) - cumsum(dv) - cumsum_kbn(dv) - -Aggregative Functions ---------------------- - -* ``minimum`` -* ``maximum`` -* ``prod`` -* ``sum`` -* ``mean`` -* ``median`` -* ``std`` -* ``var`` -* ``fft`` -* ``norm`` - -You can see these in action:: - - minimum(dv) - -To broadcast these to individual columns, use the ``col*s`` versions: - -* ``colmins`` -* ``colmaxs`` -* ``colprods`` -* ``colsums`` -* ``colmeans`` -* ``colmedians`` -* ``colstds`` -* ``colvars`` -* ``colffts`` -* ``colnorms`` - -You can see these in action:: - - colmins(df) - -Loading Standard Data Sets -========================== - -The DataFrames package is easiest to explore if you also install the RDatasets package, which provides access to 570 classic data sets:: - - 
require("RDatasets") - - iris = RDatasets.data("datasets", "iris") - dia = RDatasets.data("ggplot2", "diamonds") - -Split-Apply-Combine -------------------- - -The basic mechanism for splitting data is the ``groupby()`` function, which will produce a ``GroupedDataFrame`` object that is easiest to interact with by iterating over its entries:: - - for df in groupby(iris, "Species") - println("A DataFrame with $(nrow(df)) rows") - end - -The ``|>`` (pipe) operator for ``GroupedDataFrame`` allows you to run simple functions on the columns of the induced ``DataFrame``. You pass a simple function by producing a symbol with its name:: - - groupby(iris, "Species") |> :mean - -Another simple way to split-and-apply (without clear combining) is to use the ``map()`` function:: - - map(df -> mean(df[1]), groupby(iris, "Species")) - -Reshaping -========= - -If you are looking for the equivalent of the R "Reshape" package's ``melt()`` and ``cast()`` functions, you can use ``stack()`` and ``unstack()``. Note that these functions have exactly the opposite syntax as ``melt()`` and ``cast()``:: - - stack(iris, ["Petal.Length", "Petal.Width"]) - -Model Formulas -============== - -Design ------- - -Once support for missing data and tabular data structures are in place, we need to begin to develop a version of the model formulas "syntax" used by R. In reality, it is better to regard this "syntax" as a complete domain-specific language (DSL) for describing linear models. For those unfamiliar with this DSL, we show some examples below and then elaborate upon them to demonstrate ways in which Julia might move beyond R's formula system. - -Let's consider the simplest sort of linear regression model: how does the height of a child depend upon the height of the child's mother and father?
If we let the variable ``C`` denote the height of the child, ``M`` the height of the mother and ``F`` the height of the father, the standard linear model approach in statistics would try to model their relationship using the following equation: ``C = a + bM + cF + epsilon``, where ``a``, ``b`` and ``c`` are fixed constants and ``epsilon`` is a normally distributed noise term that accounts for the imperfect match between any specific child's height and the predictions based solely on the heights of that child's mother and father. - -In practice, we would fit such a model using a function that performs linear regression for us based on information about the model and the data source. For example, in R we would write ``lm(C ~ M + F, data = heights.data)`` to fit this model, assuming that ``heights.data`` refers to a tabular data structure containing the heights of the children, mothers and fathers for which we have data. - -If we wanted to see how the child's height depends only on the mother's height, we would write ``lm(C ~ M)``. If we were concerned only about dependence on the father's height, we would write ``lm(C ~ F)``. As you can see, we can perform many different statistical analyses using a very concise language for describing those analyses. - -What is that language? The R formula language allows one to specify linear models by specifying the terms that should be included. The language is defined by a very small number of constructs: - -* The ``~`` operator: The ``~`` operator separates the pieces of a Formula. For linear models, this means that one specifies the outputs to be predicted on the left-hand side of the ``~`` and the inputs to be used to make predictions on the right-hand side. -* The ``+`` operator: If you wish to include multiple predictors in a linear model, you use the ``+`` operator. To include both the columns ``A`` and ``B`` while predicting ``C``, you write: ``C ~ A + B``.
-* The ``&`` operator: The ``&`` operator is equivalent to ``:`` in R. It computes interaction terms, which are really an entirely new column created by combining two existing columns. For example, ``C ~ A&B`` describes a linear model with only one predictor. The values of this predictor at row ``i`` is exactly ``A[i] * B[i]``, where ``*`` is the standard arithmetic multiplication operation. Because of the precedence rules for Julia, it was not possible to use a ``:`` operator without writing a custom parser. -* The ``*`` operator: The ``*`` operator is really shorthand because ``C ~ A*B`` expands to ``C ~ A + B + A:B``. In other words, in a DSL with only three operators, the ``*`` is just syntactic sugar. - -In addition to these operators, the model formulas DSL typically allows us to include simple functions of single columns such as in the example, ``C ~ A + log(B)``. - -For Julia, this DSL will be handled by constructing an object of type ``Formula``. It will be possible to generate a ``Formula`` using explicitly quoted expression. For example, we might write the Julian equivalent of the models above as ``lm(:(C ~ M + F), heights*data)``. A ``Formula`` object describes how one should convert the columns of a ``DataFrame`` into a ``ModelMatrix``, which fully specifies a linear model. *MORE DETAILS NEEDED ABOUT HOW ``ModelMatrix`` WORKS.* - -How can Julia move beyond R? The primary improvement Julia can offer over R's model formula approach involves the use of hierarchical indexing of columns to control the inclusion of groups of columns as predictors. For example, a text regression model that uses word counts for thousands of different words as columns in a ``DataFrame`` might involve writing ``IsSpam ~ Pronouns + Prepositions + Verbs`` to exclude most words from the analysis except for those included in the ``Pronouns``, ``Prepositions`` and ``Verbs`` groups. 
In addition, we might try to improve upon some of the tricks R provides for writing hierarchical models in which each value of a categorical predictor gets its own coefficients. This occurs, for example, in hierarchical regression models of the sort implemented by R's ``lmer`` function. In addition, there are plans to support multiple LHS and RHS components of a ``Formula`` using a ``|`` operator. - -Implementation --------------- - -DETAILS NEEDED - -Factors -======= - -Design ------- - -As noted above, statistical data often involves variables that are not quantitative, but qualitative. Such variables are typically called categorical variables and can take on only a finite number of different values. For example, a data set about people might contain demographic information such as gender or nationality for which we can know the entire set of possible values in advance. Both gender and nationality are categorical variables and should not be represented using quantitative codes unless required as this is confusing to the user and mathematically suspect since the numbering used is entirely artificial. - -In general, we can require that a ``Factor`` type allow us to express variables that can take on a known, finite list of values. This finite list is called the levels of a ``Factor``. In this sense, a ``Factor`` is like an enumeration type. - -What makes a ``Factor`` more specialized than an enumeration type is that modeling tools can interpret factors using indicator variables. This is very important for specifying regression models. For example, if we run a regression in which the right-hand side includes a gender ``Factor``, the regression function can replace this factor with two dummy variable columns that encode the levels of this factor. (In practice, there are additional complications because of issues of identifiability or collinearity, but we ignore those for the time being and address them in the Implementation section.)
- -In addition to the general ``Factor`` type, we might also introduce a subtype of the ``Factor`` type that encodes ordinal variables, which are categorical variables that encode a definite ordering such as the values, "very unhappy", "unhappy", "indifferent", "happy" and "very happy". By introducing an ``OrdinalFactor`` type in which the levels of this sort of ordinal factor are represented in their proper ordering, we can provide specialized functionality like ordinal logistic regression that go beyond what is possible with ``Factor`` types alone. - -Implementation --------------- - -We have a ``Factor`` type that handles ``NA``s. This type is currently implemented using ``PooledDataVector``. - -DataStreams -=========== - -Specification of DataStream as an Abstract Protocol ---------------------------------------------------- - -A ``DataStream`` object allows one to abstractly write code that processes streaming data, which can be used for many things: - -* Analysis of massive data sets that cannot fit in memory -* Online analysis in which interim answers are required while an analysis is still underway - -Before we begin to discuss the use of ``DataStream`` in Julia, we need to distinguish between streaming data and online analysis: - -* Streaming data involves low memory usage access to a data source. Typically, one demands that a streaming data algorithm use much less memory than would be required to simply represent the full raw data source in main memory. -* Online analysis involves computations on data for which interim answers must be available. For example, given a list of a trillion numbers, one would like to have access to the estimated mean after seeing only the first *N* elements of this list. Online estimation is essential for building practical statistical systems that will be deployed in the wild. Online analysis is the *sine qua non* of active learning, in which a statistical system selects which data points it will observe next. 
- -In Julia, a ``DataStream`` is really an abstract protocol implemented by all subtypes of the abstract type, ``AbstractDataStream``. This protocol assumes the following: - -* A ``DataStream`` provides a connection to an immutable source of data that implements the standard iterator protocol used throughout Julia: - - * ``start(iter)``: Get initial iteration state. - * ``next(iter, state)``: For a given iterable object and iteration state, return the current item and the next iteration state. - * ``done(iter, state)``: Test whether we are done iterating. - -* Each call to ``next()`` causes the ``DataStream`` object to read in a chunk of rows of tabular data from the streaming source and store these in a ``DataFrame``. This chunk of data is called a minibatch and its maximum size is specified at the time the DataStream is created. It defaults to *1* if no size is explicitly specified. -* All rows from the data source must use the same tabular schema. Entries may be missing, but this missingness must be represented explicitly by the ``DataStream`` using ``NA`` values. - -Ultimately, we hope to implement a variety of ``DataStream`` types that wrap access to many different data sources like CSV files and SQL databases. At present, we have only implemented the ``FileDataStream`` type, which wraps access to a delimited file. In the future, we hope to implement: - -* MatrixDataStream -* DataFrameDataStream -* SQLDataStream -* Other tabular data sources like Fixed Width Files - -Thankfully the abstract ``DataStream`` protocol allows one to specify algorithms without regard for the specific type of ``DataStream`` being used. NB: *NoSQL databases are likely to be difficult to support because of their flexible schemas.
We will need to think about how to interface with such systems in the future.* - -Constructing DataStreams ------------------------- - -The easiest way to construct a ``DataStream`` is to specify a filename:: - - ds = DataStream("my_data_set.csv") - -You can then iterate over this ``DataStream`` to see how things work:: - - for df in ds - print(df) - end - -Use Cases for DataStreams -------------------------- - -We can compute many useful quantities using ``DataStream``: - -* *Means*: ``colmeans(ds)`` -* *Variances*: ``colvars(ds)`` -* *Covariances*: ``cov(ds)`` -* *Correlations*: ``cor(ds)`` -* *Unique element lists and counts*: *MISSING* -* *Linear models*: *MISSING* -* *Entropy*: *MISSING* - -Advice on Deploying DataStreams -------------------------------- - -* Many useful computations in statistics can be done online: - - * Estimation of means, including implicit estimation of means in Reinforcement Learning - * Estimation of entropy - * Estimation of linear regression models - -* But many other computations cannot be done online because they require completing a full pass through the data before quantities can be computed exactly. -* Before writing a DataStream algorithm, ask yourself: "what is the performance of this algorithm if I only allow it to make one pass through the data?" - -References ----------- - -* McGregor: Crash Course on Data Stream Algorithms -* Muthukrishnan : Data Streams - Algorithms and Applications -* Chakrabarti: CS85 - Data Stream Algorithms -* Knuth: Art of Computer Programming - -Ongoing Debates about NA's -========================== - -* What are the proper rules for the propagation of missingness? It is clear that there is no simple absolute rule we can follow, but we need to formulate some general principles for how to set reasonable defaults. R's strategy seems to be: - - * For operations on vectors, ``NA`` values are absolutely poisonous by default.
- * For operations on ``data.frames``, ``NA`` values are absolutely poisonous on a column-by-column basis by default. This stems from a more general which assumes that most operations on ``data.frame`` reduce to the aggregation of the same operation performed on each column independently. - * Every function should provide an ``na.rm`` option that allows one to ignore ``NA`` values. Essentially this involves replacing ``NA`` by the identity element for that function: ``sum(na.rm = TRUE)`` replaces ``NA`` values with ``0``, while ``prod(na.rm = TRUE)`` replaces ``NA`` values with ``1``. - -* Should there be multiple types of missingness? - - * For example, SAS distinguishes between: - - * Numeric missing values - * Character missing values - * Special numeric missing values - - * In statistical theory, while the *fact* of missingness is simple and does not involve multiple types of ``NA`` values, the *cause* of missingness can be different for different data sets, which leads to very different procedures that can appropriately be used. See, for example, the different suggestions in Little and Rubin (2002) about how to treat data that has entries missing completely at random (MCAR) vs. data that has entries missing at random (MAR). Should we be providing tools for handling this? External data sources will almost never provide this information, but multiple dispatch means that Julian statistical functions could insure that the appropriate computations are performed for properly typed data sets without the end-user ever understanding the process that goes on under the hood. - -* How is missingness different from ``NaN`` for ``Float``? Both share poisonous behavior and ``NaN`` propagation is very efficient in modern computers. This can provide a clever method for making ``NA`` fast for ``Float``, but does not apply to other types and seems potentially problematic as two different concepts are now aliased. 
For example, we are not uncertain about the value of ``0/0`` and should not allow any method to impute a value for it -- which any imputation method will do if we treat every ``NaN`` as equivalent to a ``NA``. -* Should cleverness ever be allowed in propagation of ``NA``? In section 3.3.4 of the R Language Definition, they note that in cases where the result of an operation would be the same for all possible values that an ``NA`` value could take on, the operation may return this constant value rather than return ``NA``. For example, ``FALSE & NA`` returns ``FALSE`` while ``TRUE | NA`` returns ``TRUE``. This sort of cleverness seems like a can-of-worms. - -Ongoing Debates about DataFrame's -================================= - -* How should RDBMS-like indices be implemented? What is most efficient? How can we avoid the inefficient vector searches that R uses? -* How should ``DataFrame`` be distributed for parallel processing? diff --git a/sphinxdoc/other/function_reference_guide.rst b/sphinxdoc/other/function_reference_guide.rst deleted file mode 100644 index 36e95af0ff..0000000000 --- a/sphinxdoc/other/function_reference_guide.rst +++ /dev/null @@ -1,589 +0,0 @@ -************************ -Function Reference Guide -************************ - -DataFrames -========== - -``DataFrame(cols::Vector, colnames::Vector{ByteString})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Construct a DataFrame from the columns given by ``cols`` with the index -generated by ``colnames``. A DataFrame inherits from -``Associative{Any,Any}``, so Associative operations should work. Columns -are vector-like objects. Normally these are AbstractDataVector's (DataVector's -or PooledDataVector's), but they can also (currently) include standard -Julia Vectors. - -``DataFrame(cols::Vector)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Construct a DataFrame from the columns given by ``cols`` with default -column names. - -``DataFrame()`` -^^^^^^^^^^^^^^^ - -An empty DataFrame. 
- -``copy(df::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^ - -A shallow copy of ``df``. Columns are referenced, not copied. - -``deepcopy(df::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A deep copy of ``df``. Copies of each column are made. - -``similar(df::DataFrame, nrow)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A new DataFrame with ``nrow`` rows and the same column names and types as ``df``. - - -Basics ------- - -``size(df)``, ``ndims(df)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Same meanings as for Arrays. - -``has(df, key)``, ``get(df, key, default)``, ``keys(df)``, and ``values(df)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Same meanings as Associative operations. ``keys`` are column names; -``values`` are column contents. - -``start(df)``, ``done(df,i)``, and ``next(df,i)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Methods to iterate over columns. - -``ncol(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Number of columns in ``df``. - -``nrow(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Number of rows in ``df``. - -``length(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Number of columns in ``df``. - -``isempty(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Whether the number of columns equals zero. - -``head(df::AbstractDataFrame)`` and ``head(df::AbstractDataFrame, i::Int)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -First ``i`` rows of ``df``. Defaults to 6. - -``tail(df::AbstractDataFrame)`` and ``tail(df::AbstractDataFrame, i::Int)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Last ``i`` rows of ``df``. Defaults to 6. - -``show(io, df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Standard pretty-printer of ``df``. Called by ``print()`` and the REPL. - -``dump(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Show the structure of ``df``. Like R's ``str``. 
- -``describe(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Show a description of each column of ``df``. - -``complete_cases(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A Vector{Bool} of indexes of complete cases in ``df`` (rows with no -NA's). - -``duplicated(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A Vector{Bool} of indexes indicating rows that are duplicates of prior -rows. - -``unique(df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -DataFrame with unique rows in ``df``. - - -Indexing, Assignment, and Concatenation ---------------------------------------- - -DataFrames are indexed like a Matrix and like an Associative. Columns -may be indexed by column name. Rows do not have names. Referencing -with one argument normally indexes by columns: ``df["col"]``, -``df[["col1","col3"]]`` or ``df[i]``. With two arguments, rows and columns -are selected. Indexing along rows works like Matrix indexing. Indexing -along columns works like Matrix indexing with the addition of column -name access. - -``getindex(df::DataFrame, ind)`` or ``df[ind]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Returns a subset of the columns of ``df`` as specified by ``ind``, which -may be an ``Int``, a ``Range``, a ``Vector{Int}``, ``ByteString``, or -``Vector{ByteString}``. Columns are referenced, not copied. For a -single-element ``ind``, the column by itself is returned. - -``getindex(df::DataFrame, irow, icol)`` or ``df[irow,icol]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Returns a subset of ``df`` as specified by ``irow`` and ``icol``. ``irow`` may -be an ``Int``, a ``Range``, or a ``Vector{Int}``. ``icol`` may be an ``Int``, a -``Range``, or a ``Vector{Int}``, ``ByteString``, or, ``ByteString``, or -``Vector{ByteString}``. For a single-element ``ind``, the column subset by -itself is returned. 
- -``index(df::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^ - -Returns the column ``Index`` for ``df``. - -``set_group(df::DataFrame, newgroup, names::Vector{ByteString})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``get_groups(df::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``set_groups(df::DataFrame, gr::Dict)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -See the Indexing section for these operations on column indexes. - -``colnames(df::DataFrame)`` or ``names(df::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The column names as an ``Array{ByteString}`` - -``setindex!(df::DataFrame, newcol, colname)`` or ``df[colname] = newcol`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Replace or add a new column with name ``colname`` and contents ``newcol``. -Arrays are converted to DataVector's. Values are recycled to match the -number of rows in ``df``. - -``insert!(df::DataFrame, index::Integer, item, name)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Insert a column of name ``name`` and with contents ``item`` into ``df`` at -position ``index``. - -``insert!(df::DataFrame, df2::DataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Insert columns of ``df2`` into ``df1``. - -``del!(df::DataFrame, cols)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Delete columns in ``df`` at positions given by ``cols`` (noted with any -means that columns can be referenced). - -``del(df::DataFrame, cols)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Nondestructive version. Return a DataFrame based on the columns in -``df`` after deleting columns specified by ``cols``. - -``deleterows!(df::DataFrame, inds)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Delete rows at positions specified by ``inds`` from the given DataFrame. - -``cbind(df1, df2, ...)`` or ``hcat(df1, df2, ...)`` or ``[df1 df2 ...]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Concatenate columns. 
Duplicated column names are adjusted. - -``rbind(df1, df2, ...)`` or ``vcat(df1, df2, ...)`` or ``[df1, df2, ...]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Concatenate rows. - -I/O ---- - -``csvDataFrame(filename, o::Options)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a DataFrame from file ``filename``. Options ``o`` include -``colnames`` (``"true"``, ``"false"``, or ``"check"`` (the default)) and -``poolstrings`` (``"check"`` (default) or ``"never"``). - -Expression/Function Evaluation in a DataFrame ---------------------------------------------- - -``with(df::AbstractDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Evaluate expression ``ex`` with the columns in ``df``. - -``within(df::AbstractDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a copy of ``df`` after evaluating expression ``ex`` with the -columns in ``df``. - -``within!(df::AbstractDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Modify ``df`` by evaluating expression ``ex`` with the columns in ``df``. - -``based_on(df::AbstractDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a new DataFrame based on evaluating expression ``ex`` with the -columns in ``df``. Often used for summarizing operations. - -``colwise(f::Function, df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``colwise(f::Vector{Function}, df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Apply ``f`` to each column of ``df``, and return the results as an -Array{Any}. - -``colwise(df::AbstractDataFrame, s::Symbol)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``colwise(df::AbstractDataFrame, s::Vector{Symbol})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Apply the function specified by Symbol ``s`` to each column of ``df``, and -return the results as a DataFrame. 
- -SubDataFrames -------------- - -``sub(df::DataFrame, r, c)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``sub(df::DataFrame, r)`` -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a SubDataFrame with references to rows and columns of ``df``. - - -``sub(sd::SubDataFrame, r, c)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``sub(sd::SubDataFrame, r)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a SubDataFrame with references to rows and columns of ``df``. - -``getindex(sd::SubDataFrame, r, c)`` or ``sd[r,c]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``getindex(sd::SubDataFrame, c)`` or ``sd[c]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Referencing should work the same as DataFrames. - - -Grouping --------- - -``groupby(df::AbstractDataFrame, cols)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a GroupedDataFrame based on unique groupings indicated by the -columns with one or more names given in ``cols``. - -``start(gd)``, ``done(gd,i)``, and ``next(gd,i)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Methods to iterate over GroupedDataFrame groupings. - -``getindex(gd::GroupedDataFrame, idx)`` or ``gd[idx]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Reference a particular grouping. Referencing returns a SubDataFrame. - -``with(gd::GroupedDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Evaluate expression ``ex`` with the columns in ``gd`` in each grouping. - -``within(gd::GroupedDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``within!(gd::GroupedDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a DataFrame with the results of evaluating expression ``ex`` with -the columns in ``gd`` in each grouping. - -``based_on(gd::GroupedDataFrame, ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Sweeps along groups and applies ``based_on`` to each group. Returns a -DataFrame. 
- -``map(f::Function, gd::GroupedDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Apply ``f`` to each grouping of ``gd`` and return the results in an Array. - -``colwise(f::Function, gd::GroupedDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``colwise(f::Vector{Function}, gd::GroupedDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Apply ``f`` to each column in each grouping of ``gd``, and return the -results as an Array{Any}. - -``colwise(gd::GroupedDataFrame, s::Symbol)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``colwise(gd::GroupedDataFrame, s::Vector{Symbol})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Apply the function specified by Symbol ``s`` to each column of in each -grouping of ``gd``, and return the results as a DataFrame. - -``by(df::AbstractDataFrame, cols, s::Symbol)`` or ``groupby(df, cols) |> s`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``by(df::AbstractDataFrame, cols, s::Vector{Symbol})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a DataFrame with the results of grouping on ``cols`` and -``colwise`` evaluation based on ``s``. Equivalent to ``colwise(groupby(df, -cols), s)``. - -``by(df::AbstractDataFrame, cols, e::Expr)`` or ``groupby(df, cols) |> e`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return a DataFrame with the results of grouping on ``cols`` and -evaluation of ``e`` in each grouping. Equivalent to ``based_on(groupby(df, -cols), e)``. - -Reshaping / Merge ------------------ - -``stack(df::DataFrame, cols)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For conversion from wide to long format. Returns a DataFrame with -stacked columns indicated by ``cols``. The result has column ``"key"`` -with column names from ``df`` and column ``"value"`` with the values from -``df``. Columns in ``df`` not included in ``cols`` are duplicated along the -stack. 
- -``unstack(df::DataFrame, ikey, ivalue, irefkey)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For conversion from long to wide format. Returns a DataFrame. ``ikey`` -indicates the key column--unique values in column ``ikey`` will be -column names in the result. ``ivalue`` indicates the value column. -``irefkey`` is the column with a unique identifier for that . Columns -not given by ``ikey``, ``ivalue``, or ``irefkey`` are currently ignored. - -``merge(df1::DataFrame, df2::DataFrame, bycol)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``merge(df1::DataFrame, df2::DataFrame, bycol, jointype)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Return the database join of ``df1`` and ``df2`` based on the column ``bycol``. -Currently only a single merge key is supported. Supports ``jointype`` of -"inner" (the default), "left", "right", or "outer". - - -Index -===== - -``Index()`` -^^^^^^^^^^^ -``Index(s::Vector{ByteString})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An Index with names ``s``. An Index is like an Associative type. An -Index is used for column indexing of DataFrames. An Index maps -ByteStrings and Vector{ByteStrings} to Indices. - -``length(x::Index)``, ``copy(x::Index)``, ``has(x::Index, key)``, ``keys(x::Index)``, ``push!(x::Index, name)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Normal meanings. - -``del(x::Index, idx::Integer)``, ``del(x::Index, s::ByteString)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Delete the name ``s`` or name at position ``idx`` in ``x``. - -``names(x::Index)`` -^^^^^^^^^^^^^^^^^^^ - -A Vector{ByteString} with the names of ``x``. - -``names!(x::Index, nm::Vector{ByteString})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Set names ``nm`` in ``x``. 
- -``rename(x::Index, f::Function)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``rename(x::Index, nd::Associative)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``rename(x::Index, from::Vector, to::Vector)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Replace names in ``x``, by applying function ``f`` to each name, -by mapping old to new names with a dictionary (Associative), or using -``from`` and ``to`` vectors. - -``getindex(x::Index, idx)`` or ``x[idx]`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This does the mapping from name(s) to Indices (positions). ``idx`` may -be ByteString, Vector{ByteString}, Int, Vector{Int}, Range{Int}, -Vector{Bool}, AbstractDataVector{Bool}, or AbstractDataVector{Int}. - -``set_group(idx::Index, newgroup, names::Vector{ByteString})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Add a group to ``idx`` with name ``newgroup`` that includes the names in -the vector ``names``. - -``get_groups(idx::Index)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A Dict that maps the name of each group to the names in the group. - -``set_groups(idx::Index, gr::Dict)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Set groups in ``idx`` based on the mapping given by ``gr``. - - -Missing Values -============== - -Missing value behavior is implemented by instantiations of the ``AbstractDataVector`` -abstract type. - -``NA`` -^^^^^ - -A constant indicating a missing value. - -``isna(x)`` -^^^^^^^^^^^ - -Return a ``Bool`` or ``Array{Bool}`` (if ``x`` is an ``AbstractDataVector``) -that is ``true`` for elements with missing values. - -``nafilter(x)`` -^^^^^^^^^^^^^^^ - -Return a copy of ``x`` after removing missing values. - -``nareplace(x, val)`` -^^^^^^^^^^^^^^^^^^^^^ - -Return a copy of ``x`` after replacing missing values with ``val``. - -``naFilter(x)`` -^^^^^^^^^^^^^^^ - -Return an object based on ``x`` such that future operations like ``mean`` -will not include missing values. This can be an iterator or other -object. 
- -``naReplace(x, val)`` -^^^^^^^^^^^^^^^^^^^^^ - -Return an object based on ``x`` such that future operations like ``mean`` -will replace NAs with ``val``. - -``na(x)`` -^^^^^^^^^ - -Return an ``NA`` value appropriate for the type of ``x``. - -``nas(x, dim)`` -^^^^^^^^^^^^^^^ - -Return an object like ``x`` filled with ``NA`` values with size ``dim``. - - -DataVector's -============ - -``DataArray(x::Vector)`` -^^^^^^^^^^^^^^^^^^^^^^^^ -``DataArray(x::Vector, m::Vector{Bool})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Create a DataVector from ``x``, with ``m`` optionally indicating which values -are NA. DataVector's are like Julia Vectors with support for NA's. ``x`` may -be any type of Vector. - -``PooledDataArray(x::Vector)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``PooledDataArray(x::Vector, m::Vector{Bool})`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Create a PooledDataVector from ``x``, with ``m`` optionally indicating which -values are NA. PooledDataVector's contain a pool of values with references -to those values. This is useful in a similar manner to an R array of -factors. - -``size``, ``length``, ``ndims``, ``ref``, ``assign``, ``start``, ``next``, ``done`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All normal Vector operations including array referencing should work. - -``isna(x)``, ``nafilter(x)``, ``nareplace(x, val)``, ``naFilter(x)``, ``naReplace(x, val)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All NA-related methods are supported. - -Utilities -========= - -``cut(x::Vector, breaks::Vector)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Returns a PooledDataVector with length equal to ``x`` that divides values in ``x`` -based on the divisions given by ``breaks``. - -Formulas and Models -=================== - -``Formula(ex::Expr)`` -^^^^^^^^^^^^^^^^^^^^^ - -Return a Formula object based on ``ex``. 
Formulas are two-sided -expressions separated by ``~``, like ``:(y ~ w*x + z + i&v)``. - -``model_frame(f::Formula, d::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``model_frame(ex::Expr, d::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ModelFrame. - -``model_matrix(mf::ModelFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``model_matrix(f::Formula, d::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``model_matrix(ex::Expr, d::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ModelMatrix based on ``mf``, ``f`` and ``d``, or ``ex`` and ``d``. - -``lm(ex::Expr, df::AbstractDataFrame)`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Linear model results (type OLSResults) based on formula ``ex`` and ``df``. diff --git a/sphinxdoc/other/specification.rst b/sphinxdoc/other/specification.rst deleted file mode 100644 index 3cfee923e4..0000000000 --- a/sphinxdoc/other/specification.rst +++ /dev/null @@ -1,214 +0,0 @@ -******************** -Formal Specification -******************** - -DataFrames Data Structures -========================== - -* Type Definitions and Type Hierarchy -* Constructors -* Indexing (Refs / Assigns) -* Operators - - * Unary Operators: - - * ``+``, ``-``, ``!``, ``'`` - - * Elementary Unary Functions - - * ``abs``, ... 
- - * Binary Operators: - - * Arithmetic Operators: - - * Scalar Arithmetic: ``+``, ``-``, ``*``, ``/``, - * Array Arithmetic: ``+``, ``.+``, ``-``, ``.-``, ``.*``, ``./``, ``.^`` - - * Bit Operators: ``&``, ``|``, - * Comparison Operators: - - * Scalar Comparisons: ``==``, ``!=``, ``<``, ``<=``, ``>``, - * Array Comparisons: ``.==``, ``.!=``, ``.<``, ``.<=``, ``.>``, ``.>=`` - -* Container Operations -* Broadcasting / Recycling -* Type Promotion and Conversion -* String Representations -* IO -* Copying -* Properties - - * size - * length - * ndims - * eltype - -* Predicates -* Handling NA's -* Iteration -* Miscellaneous - -The NAtype -========== - -Behavior under Unary Operators ------------------------------- - -The unary operators - -Behavior under Unary Operators ------------------------------- - -The unary operators - -Behavior under Arithmetic Operators ------------------------------------ - -Constructors -============ - -* NA's - - * Constructor: ``NAtype()`` - * Const alias: ``NA`` - -* DataVector's - - * From (Vector, BitVector): ``DataArray([1, 2, 3], falses(3))`` - * From (Vector, Vector{Bool}): ``DataArray([1, 2, 3], [false, false, false])`` - * From (Vector): ``DataArray([1, 2, 3])`` - * From (BitVector, BitVector): ``DataArray(trues(3), falses(3))`` - * From (BitVector): ``DataArray(trues(3))`` - * From (Range1): ``DataArray(1:3)`` - * From (DataVector): ``DataArray(DataArray([1, 2, 3]))`` - * From (Type, Int): ``DataArray(Int, 3)`` - * From (Int): ``DataArray(3)`` (Type defaults to Float64) - * From (): ``DataArray()`` (Type defaults to Float64, length defaults to 0) - * Initialized with Float64 zeros: ``datazeros(3)`` - * Initialized with typed zeros: ``datazeros(Int, 3)`` - * Initialized with Float64 ones: ``dataones(3)`` - * Initialized with typed ones: ``dataones(Int, 3)`` - * Initialized with falses: ``datafalses(3)`` - * Initialized with trues: ``datatrues(3)`` - * Literal syntax: ``DataVector[1, 2, NA]`` - -* PooledDataVector's - - * From 
(Vector, BitVector): ``PooledDataArray([1, 2, 3], falses(3))`` - * From (Vector, Vector{Bool}): ``PooledDataArray([1, 2, 3], [false, false, false])`` - * From (Vector): ``PooledDataArray([1, 2, 3])`` - * From (BitVector, BitVector): ``PooledDataArray(trues(3), falses(3))`` - * From (BitVector, Vector{Bool}): ``PooledDataArray(trues(3), [false, false, false])`` - * From (BitVector): ``PooledDataArray(trues(3))`` - * From (Range1): ``PooledDataArray(1:3)`` - * From (DataVector): ``PooledDataArray(DataArray([1, 2, 3]))`` - * From (Type, Int): ``PooledDataArray(Int, 3)`` - * From (Int): ``PooledDataArray(3)`` (Type defaults to Float64) - * From (): ``PooledDataArray()`` (Type defaults to Float64, length defaults to 0) - * Initialized with Float64 zeros: ``pdatazeros(3)`` - * Initialized with typed zeros: ``pdatazeros(Int, 3)`` - * Initialized with Float64 ones: ``pdataones(3)`` - * Initialized with typed ones: ``pdataones(Int, 3)`` - * Initialized with falses: ``pdatafalses(3)`` - * Initialized with trues: ``pdatatrues(3)`` - * Literal syntax: ``PooledDataVector[1, 2, NA]`` - -* DataMatrix - - * From (Array, BitArray): ``DataMatrix([1 2; 3 4], falses(2, 2))`` - * From (Array, Array{Bool}): ``DataMatrix([1 2; 3 4], [false false; false false])`` - * From (Array): ``DataMatrix([1 2; 3 4])`` - * From (BitArray, BitArray): ``DataMatrix(trues(2, 2), falses(2, 2))`` - * From (BitArray): ``DataMatrix(trues(2, 2))`` - * From (DataVector...): ``DataMatrix(DataVector[1, NA], DataVector[NA, 2])`` - * From (Range1...): ``DataMatrix(1:3, 1:3)`` - * From (DataMatrix): ``DataMatrix(DataArray([1 2; 3 4]))`` - * From (Type, Int, Int): ``DataMatrix(Int, 2, 2)`` - * From (Int, Int): ``DataMatrix(2, 2)`` (Type defaults to Float64) - * From (): ``DataMatrix()`` (Type defaults to Float64, length defaults to (0, 0)) - * Initialized with Float64 zeros: ``dmzeros(2, 2)`` - * Initialized with typed zeros: ``dmzeros(Int, 2, 2)`` - * Initialized with Float64 ones: ``dmones(2, 2)`` - * Initialized 
with typed ones: ``dmones(Int, 2, 2)`` - * Initialized with falses: ``dmfalses(2, 2)`` - * Initialized with trues: ``dmtrues(2, 2)`` - * Initialized identity matrix: ``dmeye(2, 2)`` - * Initialized identity matrix: ``dmeye(2)`` - * Initialized diagonal matrix: ``dmdiagm([2, 1])`` - * Literal syntax: ``DataMatrix[1 2; NA 2]`` - -* DataFrame - - * From (): ``DataFrame()`` - * From (Vector{Any}, Index): ``DataFrame({datazeros(3), dataones(3)}, Index(["A", "B"]))`` - * From (Vector{Any}): ``DataFrame({datazeros(3), dataones(3)})`` - * From (Expr): ``DataFrame(quote A = [1, 2, 3, 4] end)`` - * From (Matrix, Vector{String}): ``DataFrame([1 2; 3 4], ["A", "B"])`` - * From (Matrix): ``DataFrame([1 2; 3 4])`` - * From (Tuple): ``DataFrame(dataones(2), datafalses(2))`` - * From (Associative): ??? - * From (Vector, Vector, Groupings): ??? - * From (Dict of Vectors): ``DataFrame({"A" => [1, 3], "B" => [2, 4]})`` - * From (Dict of Vectors, Vector{String}): ``DataFrame({"A" => [1, 3], "B" => [2, 4]}, ["A"])`` - * From (Type, Int, Int): ``DataFrame(Int, 2, 2)`` - * From (Int, Int): ``DataFrame(2, 2)`` - * From (Vector{Types}, Vector{String}, Int): ``DataFrame({Int, Float64}, ["A", "B"], 2)`` - * From (Vector{Types}, Int): ``DataFrame({Int, Float64}, 2)`` - -Indexing -======== - -Types on indices:: - - NA - - dv = datazeros(10) - - dv[1] - - dv[1:2] - - dv[:] - - dv[[1, 2 3]] - - dv[[false, false, true, false, false]] - - dmzeros(10) - -Indexers: Int, Range, Colon, Vector{Int}, Vector{Bool}, String, Vector{String} - -DataVector's and PooledDataVector's implement: - -* Int -* Range -* Colon -* Vector{Int} -* Vector{Bool} - -DataMatrix's implement the Cartesian product: - -* Int, Int -* Int, Range -* Int, Colon -* Int, Vector{Int} -* Int, Vector{Bool}... -* Vector{Bool}, Int -* Vector{Bool}, Range -* Vector{Bool}, Colon -* Vector{Bool}, Vector{Int} -* Vector{Bool}, Vector{Bool} - -Single Int access? 
- -DataFrame's add two new indexer types: - -* String -* Vector{String} - -These can only occur as (a) the only indexer or (b) in the second slot of a paired indexer - -Anything that can be getindex()'d can also be setindex!()'d - -Where do we allow Expr indexing? diff --git a/sphinxdoc/source/subsets.rst b/sphinxdoc/source/subsets.rst index e89647fde7..d25eab10f9 100644 --- a/sphinxdoc/source/subsets.rst +++ b/sphinxdoc/source/subsets.rst @@ -55,18 +55,18 @@ because columns can be referred to by name:: julia> df = DataFrame(A = 1:10, B = 2:2:20) 10x2 DataFrame - | Row # | A | B | - |-------|----|----| - | 1 | 1 | 2 | - | 2 | 2 | 4 | - | 3 | 3 | 6 | - | 4 | 4 | 8 | - | 5 | 5 | 10 | - | 6 | 6 | 12 | - | 7 | 7 | 14 | - | 8 | 8 | 16 | - | 9 | 9 | 18 | - | 10 | 10 | 20 | + | Row | A | B | + |-----|----|----| + | 1 | 1 | 2 | + | 2 | 2 | 4 | + | 3 | 3 | 6 | + | 4 | 4 | 8 | + | 5 | 5 | 10 | + | 6 | 6 | 12 | + | 7 | 7 | 14 | + | 8 | 8 | 16 | + | 9 | 9 | 18 | + | 10 | 10 | 20 | Refering to the first column by index or name:: @@ -109,43 +109,43 @@ Selecting a subset of rows by index and an (ordered) subset of columns by name:: julia> df[1:3, [:A, :B]] 3x2 DataFrame - | Row # | A | B | - |-------|---|---| - | 1 | 1 | 2 | - | 2 | 2 | 4 | - | 3 | 3 | 6 | + | Row | A | B | + |-----|---|---| + | 1 | 1 | 2 | + | 2 | 2 | 4 | + | 3 | 3 | 6 | julia> df[1:3, [:B, :A]] 3x2 DataFrame - | Row # | B | A | - |-------|---|---| - | 1 | 2 | 1 | - | 2 | 4 | 2 | - | 3 | 6 | 3 | + | Row | B | A | + |-----|---|---| + | 1 | 2 | 1 | + | 2 | 4 | 2 | + | 3 | 6 | 3 | Selecting a subset of rows by using a condition:: julia> df[df[:A] % 2 .== 0, :] 5x2 DataFrame - | Row # | A | B | - |-------|----|----| - | 1 | 2 | 4 | - | 2 | 4 | 8 | - | 3 | 6 | 12 | - | 4 | 8 | 16 | - | 5 | 10 | 20 | + | Row | A | B | + |-----|----|----| + | 1 | 2 | 4 | + | 2 | 4 | 8 | + | 3 | 6 | 12 | + | 4 | 8 | 16 | + | 5 | 10 | 20 | julia> df[df[:B] % 2 .== 0, :] 10x2 DataFrame - | Row # | A | B | - |-------|----|----| - | 1 | 1 | 
2 | - | 2 | 2 | 4 | - | 3 | 3 | 6 | - | 4 | 4 | 8 | - | 5 | 5 | 10 | - | 6 | 6 | 12 | - | 7 | 7 | 14 | - | 8 | 8 | 16 | - | 9 | 9 | 18 | - | 10 | 10 | 20 | + | Row | A | B | + |-----|----|----| + | 1 | 1 | 2 | + | 2 | 2 | 4 | + | 3 | 3 | 6 | + | 4 | 4 | 8 | + | 5 | 5 | 10 | + | 6 | 6 | 12 | + | 7 | 7 | 14 | + | 8 | 8 | 16 | + | 9 | 9 | 18 | + | 10 | 10 | 20 | From fa1c2a771310891dab775b2e5effd02f031b0b5b Mon Sep 17 00:00:00 2001 From: Sean Garborg Date: Tue, 25 Nov 2014 21:31:52 -0700 Subject: [PATCH 2/4] Tidy display code --- src/abstractdataframe/reshape.jl | 2 +- src/abstractdataframe/show.jl | 20 -------------------- src/dataframe/dataframe.jl | 9 +++------ src/dataframerow/show.jl | 18 ------------------ test/show.jl | 10 +++++++++- test/utils.jl | 2 +- 6 files changed, 14 insertions(+), 47 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index fb8f3c2fc8..6c610e5b38 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -56,7 +56,7 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int) i = int(refkeycol.refs[k]) if i > 0 && j > 0 if nowarning && !isna(payload[j][i]) - println("Warning: duplicate entries in unstack.") + warn("Duplicate entries in unstack.") nowarning = false end payload[j][i] = valuecol[k] diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index b46f07c0d2..c6dc4f1a32 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -574,23 +574,3 @@ function showcols(io::IO, df::AbstractDataFrame) # -> Nothing end showcols(df::AbstractDataFrame) = showcols(STDOUT, df) # -> Nothing - -#' @exported -#' @description -#' -#' Print an AbstractDataFrame to an IO system with an added newline. -#' -#' @param io::IO The `io` system to be rendered to. -#' @param df::AbstractDataFrame An AbstractDataFrame. -#' -#' @returns o::Nothing A `nothing` value. 
-#' -#' @examples -#' -#' df = DataFrame(A = 1:3, B = ["x", "y", "z"]) -#' print(STDOUT, df) -# TODO: Determine if this method is strictly necessary. -function Base.print(io::IO, df::AbstractDataFrame) - show(io, df) - print(io, '\n') -end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index d81e465da3..195171ed27 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -275,8 +275,7 @@ function insert_single_column!(df::DataFrame, push!(index(df), nextcolname(df)) push!(df.columns, dv) else - println("Column does not exist: $col_ind") - error("Cannot assign to non-existent column") + error("Cannot assign to non-existent column: $col_ind") end end end @@ -288,8 +287,7 @@ function insert_single_entry!(df::DataFrame, v::Any, row_ind::Real, col_ind::Col df.columns[index(df)[col_ind]][row_ind] = v return v else - println("Column does not exist: $col_ind") - error("Cannot assign to non-existent column") + error("Cannot assign to non-existent column: $col_ind") end end @@ -301,8 +299,7 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame, df.columns[index(df)[col_ind]][row_inds] = v return v else - println("Column does not exist: $col_ind") - error("Cannot assign to non-existent column") + error("Cannot assign to non-existent column: $col_ind") end end diff --git a/src/dataframerow/show.jl b/src/dataframerow/show.jl index 45de5130f4..c3b83a1cb3 100644 --- a/src/dataframerow/show.jl +++ b/src/dataframerow/show.jl @@ -22,21 +22,3 @@ function Base.show(io::IO, r::DataFrameRow) println(io, rpad(label, labelwidth, ' '), value) end end - -#' @exported -#' @description -#' -#' Render a DataFrameRow to STDOUT. See other `show` documentation for -#' details. -#' -#' @param r::DataFrameRow The DataFrameRow to be rendered to `io`. -#' -#' @returns o::Nothing A `nothing` value. 
-#' -#' @examples -#' -#' df = DataFrame(A = 1:3, B = ["x", "y", "z"]) -#' for r in eachrow(df) -#' show(r) -#' end -Base.show(r::DataFrameRow) = show(STDOUT, r) diff --git a/test/show.jl b/test/show.jl index 2141d366e1..e668640f5e 100644 --- a/test/show.jl +++ b/test/show.jl @@ -8,12 +8,20 @@ module TestShow showall(io, df) showall(io, df, true) - subdf = df[df[:A] .> 1.0, :] + subdf = sub(df, [2, 3]) # df[df[:A] .> 1.0, :] show(io, subdf) show(io, subdf, true) showall(io, subdf) showall(io, subdf, true) + dfvec = DataFrame[df for _=1:3] + show(io, dfvec) + showall(io, dfvec) + + gd = groupby(df, :A) + show(io, gd) + showall(io, gd) + dfr = DataFrameRow(df, 1) show(io, dfr) diff --git a/test/utils.jl b/test/utils.jl index dfb2b0d38c..acf7439d95 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -27,7 +27,7 @@ module TestUtils @test rw == DataFrames.RESERVED_WORDS end else - warn("Unable to find validate reserved words against parser. ", + warn("Unable to validate reserved words against parser. ", "Expected if Julia was not built from source.") end From f2381277a5a829c95ec96937b864fc44447154a8 Mon Sep 17 00:00:00 2001 From: Sean Garborg Date: Wed, 26 Nov 2014 15:25:22 -0700 Subject: [PATCH 3/4] Cheapen index creation, speed up symbol column lookup If columns groups come back, at least restrict storage to Int (!Real) --- src/other/index.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/other/index.jl b/src/other/index.jl index 00108ffdab..28c53d5fe2 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -3,20 +3,18 @@ # through cleanly. # an Index is the usual implementation. # a SimpleIndex only works if the things are integer indexes, which is weird. -typealias Indices Union(Real, AbstractVector{Real}) - abstract AbstractIndex type Index <: AbstractIndex # an OrderedDict would be nice here... 
- lookup::Dict{Symbol, Indices} # name => names array position + lookup::Dict{Symbol, Int} # name => names array position names::Vector{Symbol} end function Index(names::Vector{Symbol}) u = make_unique(names) - lookup = Dict{Symbol, Indices}(zip(u, 1:length(u))) + lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) Index(lookup, u) end -Index() = Index(Dict{Symbol, Indices}(), Symbol[]) +Index() = Index(Dict{Symbol, Int}(), Symbol[]) Base.length(x::Index) = length(x.names) Base.names(x::Index) = copy(x.names) Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names)) From 79e3217dd8855575bcf6f16a87222fafa37919f3 Mon Sep 17 00:00:00 2001 From: Sean Garborg Date: Wed, 26 Nov 2014 16:11:48 -0700 Subject: [PATCH 4/4] Fix stack overflows, etc., in stack and stackdf, of all names --- src/abstractdataframe/reshape.jl | 15 ++++++++++----- src/other/utils.jl | 10 ++++++++++ test/data.jl | 2 ++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 6c610e5b38..d4a9ac3a47 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -6,6 +6,8 @@ ## ############################################################################## +typealias Ints Union(Int, Vector{Int}) + ############################################################################## ## ## stack() @@ -13,7 +15,7 @@ ## ############################################################################## -function stack(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int}) +function stack(df::AbstractDataFrame, measure_vars::Ints, id_vars::Ints) res = DataFrame[insert!(df[[i, id_vars]], 1, names(df)[i], :variable) for i in measure_vars] # fix column names map(x -> names!(x, [:variable, :value, names(df[id_vars])]), res) @@ -80,7 +82,7 @@ unstack(df::AbstractDataFrame, rowkey, colkey, value) = # - can't have zero rows or zero columns # - the resulting data part is Float64 (`payload` below) -function 
pivottable(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, value::Int, fun::Function) +function pivottable(df::AbstractDataFrame, rows::Ints, cols::Ints, value::Int, fun::Function) # `rows` vector indicating which columns are keys placed in rows # `cols` vector indicating which columns are keys placed as column headers # `value` integer indicating which column has values @@ -99,13 +101,13 @@ function pivottable(df::AbstractDataFrame, rows::Vector{Int}, cols::Vector{Int}, payload[row_idxs[i], col_idxs[i]] = cmb_df[i, :x1] end # find the "row" key DataFrame - g = groupby(cmb_df[[1:length(rows)]], [1:length(rows)]) + g = groupby(cmb_df[1:length(rows)], 1:length(rows)) row_key_df = g.parent[g.idx[g.starts], :] hcat!(row_key_df, payload) end # `mean` is the default aggregation function: pivottable(df::AbstractDataFrame, rows, cols, value) = pivottable(df, rows, cols, value, mean) -pivottable(df::AbstractDataFrame, rows, cols, value, fun) = pivottable(df, [index(df)[rows]], [index(df)[cols]], index(df)[value], fun) +pivottable(df::AbstractDataFrame, rows, cols, value, fun) = pivottable(df, index(df)[rows], index(df)[cols], index(df)[value], fun) pivottable(fun::Function, df::AbstractDataFrame, rows, cols, value) = pivottable(df, rows, cols, value, fun) function paste_columns(df::AbstractDataFrame, sep) @@ -235,7 +237,7 @@ end # Same as `stack`, but uses references # I'm not sure the name is very good -function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vector{Int}) +function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Ints) N = length(measure_vars) cnames = names(df)[id_vars] insert!(cnames, 1, "value") @@ -246,6 +248,9 @@ function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int}, id_vars::Vect [RepeatedVector(df[:,c], N) for c in id_vars]...], # id_var columns cnames) end +function stackdf(df::AbstractDataFrame, measure_vars::Int, id_vars) + stackdf(df, [measure_vars], id_vars) +end function 
stackdf(df::AbstractDataFrame, measure_vars, id_vars) stackdf(df, index(df)[measure_vars], index(df)[id_vars]) end diff --git a/src/other/utils.jl b/src/other/utils.jl index 2000928cda..7b4038a2b1 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -177,6 +177,16 @@ function _setdiff{T}(a::AbstractVector{T}, b::AbstractVector{T}) end diff end +# because unions and parametric types don't compose, yet +function _setdiff{T}(a::AbstractVector{T}, b::T) + diff = T[] + for val in a + if !(val in b) + push!(diff, val) + end + end + diff +end function _uniqueofsorted(x::Vector) idx = fill(true, length(x)) diff --git a/test/data.jl b/test/data.jl index 652fcda5d5..96b1118226 100644 --- a/test/data.jl +++ b/test/data.jl @@ -140,6 +140,7 @@ module TestData c = randn(12), d = randn(12)) + stack(d1, :a) d1s = stack(d1, [:a, :b]) d1s2 = stack(d1, [:c, :d]) d1m = melt(d1, [:c, :d]) @@ -148,6 +149,7 @@ module TestData @test names(d1s) == [:variable, :value, :c, :d] @test isequal(d1s, d1m) + stackdf(d1, :a) d1s_df = stackdf(d1, [:a, :b]) d1m_df = meltdf(d1, [:c, :d]) @test isequal(d1s[:variable], d1s_df[:variable][:])