Skip to content

Commit

Permalink
Merge pull request #49 from aschaffer/andrei-pr
Browse files Browse the repository at this point in the history
[Review] Multi-column group-by and filters - sort-based single-aggregation
  • Loading branch information
mtjrider authored Jul 19, 2018
2 parents c6c6f95 + 9c0a162 commit 1c5936f
Show file tree
Hide file tree
Showing 12 changed files with 5,166 additions and 3 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ cuda_add_library(gdf SHARED
src/scan.cu
src/segmented_sorting.cu
src/datetimeops.cu
src/sqls_ops.cu
)


Expand Down
69 changes: 69 additions & 0 deletions include/gdf/cffi/functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -404,3 +404,72 @@ gdf_error gdf_max_f32(gdf_column *col, float *dev_result, gdf_size_type dev_resu
gdf_error gdf_max_i64(gdf_column *col, int64_t *dev_result, gdf_size_type dev_result_size);
gdf_error gdf_max_i32(gdf_column *col, int32_t *dev_result, gdf_size_type dev_result_size);
gdf_error gdf_max_i8(gdf_column *col, int8_t *dev_result, gdf_size_type dev_result_size);

/*
Multi-Column SQL ops:
WHERE (Filtering)
ORDER-BY
GROUP-BY
*/
gdf_error gdf_order_by(size_t nrows, //in: # rows
gdf_column* cols, //in: host-side array of gdf_columns
size_t ncols, //in: # cols
void** d_cols, //out: pre-allocated device-side array to be filled with gdf_column::data for each column; slicing of gdf_column array (host)
int* d_types, //out: pre-allocated device-side array to be filled with gdf_colum::dtype for each column; slicing of gdf_column array (host)
size_t* d_indx); //out: device-side array of re-rdered row indices

gdf_error gdf_filter(size_t nrows, //in: # rows
gdf_column* cols, //in: host-side array of gdf_columns
size_t ncols, //in: # cols
void** d_cols, //out: pre-allocated device-side array to be filled with gdf_column::data for each column; slicing of gdf_column array (host)
int* d_types, //out: pre-allocated device-side array to be filled with gdf_colum::dtype for each column; slicing of gdf_column array (host)
void** d_vals, //in: device-side array of values to filter against (type-erased)
size_t* d_indx, //out: device-side array of row indices that remain after filtering
size_t* new_sz); //out: host-side # rows that remain after filtering

gdf_error gdf_group_by_sum(int ncols, // # columns
gdf_column** cols, //input cols
gdf_column* col_agg, //column to aggregate on
gdf_column* out_col_indices, //if not null return indices of re-ordered rows
gdf_column** out_col_values, //if not null return the grouped-by columns
//(multi-gather based on indices, which are needed anyway)
gdf_column* out_col_agg, //aggregation result
gdf_context* ctxt); //struct with additional info: bool is_sorted, flag_sort_or_hash, bool flag_count_distinct

gdf_error gdf_group_by_min(int ncols, // # columns
gdf_column** cols, //input cols
gdf_column* col_agg, //column to aggregate on
gdf_column* out_col_indices, //if not null return indices of re-ordered rows
gdf_column** out_col_values, //if not null return the grouped-by columns
//(multi-gather based on indices, which are needed anyway)
gdf_column* out_col_agg, //aggregation result
gdf_context* ctxt); //struct with additional info: bool is_sorted, flag_sort_or_hash, bool flag_count_distinct


gdf_error gdf_group_by_max(int ncols, // # columns
gdf_column** cols, //input cols
gdf_column* col_agg, //column to aggregate on
gdf_column* out_col_indices, //if not null return indices of re-ordered rows
gdf_column** out_col_values, //if not null return the grouped-by columns
//(multi-gather based on indices, which are needed anyway)
gdf_column* out_col_agg, //aggregation result
gdf_context* ctxt); //struct with additional info: bool is_sorted, flag_sort_or_hash, bool flag_count_distinct


gdf_error gdf_group_by_avg(int ncols, // # columns
gdf_column** cols, //input cols
gdf_column* col_agg, //column to aggregate on
gdf_column* out_col_indices, //if not null return indices of re-ordered rows
gdf_column** out_col_values, //if not null return the grouped-by columns
//(multi-gather based on indices, which are needed anyway)
gdf_column* out_col_agg, //aggregation result
gdf_context* ctxt); //struct with additional info: bool is_sorted, flag_sort_or_hash, bool flag_count_distinct

gdf_error gdf_group_by_count(int ncols, // # columns
gdf_column** cols, //input cols
gdf_column* col_agg, //column to aggregate on
gdf_column* out_col_indices, //if not null return indices of re-ordered rows
gdf_column** out_col_values, //if not null return the grouped-by columns
//(multi-gather based on indices, which are needed anyway)
gdf_column* out_col_agg, //aggregation result
gdf_context* ctxt); //struct with additional info: bool is_sorted, flag_sort_or_hash, bool flag_count_distinct
29 changes: 26 additions & 3 deletions include/gdf/cffi/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ typedef enum {
GDF_INT64,
GDF_FLOAT32,
GDF_FLOAT64,
GDF_DATE32, // int32_t days since the UNIX epoch
GDF_DATE64, // int64_t milliseconds since the UNIX epoch
GDF_TIMESTAMP // Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond)
GDF_DATE32, // int32_t days since the UNIX epoch
GDF_DATE64, // int64_t milliseconds since the UNIX epoch
GDF_TIMESTAMP,// Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond)
N_GDF_TYPES, /* additional types should go BEFORE N_GDF_TYPES */
} gdf_dtype;

typedef enum {
Expand All @@ -21,10 +22,12 @@ typedef enum {
GDF_UNSUPPORTED_DTYPE,
GDF_COLUMN_SIZE_MISMATCH,
GDF_COLUMN_SIZE_TOO_BIG,
GDF_DATASET_EMPTY,
GDF_VALIDITY_MISSING,
GDF_VALIDITY_UNSUPPORTED,
GDF_JOIN_DTYPE_MISMATCH,
GDF_JOIN_TOO_MANY_COLUMNS,
GDF_UNSUPPORTED_METHOD,
} gdf_error;

typedef enum {
Expand All @@ -48,8 +51,28 @@ typedef struct gdf_column_{
gdf_dtype_extra_info dtype_info;
} gdf_column;

typedef enum {
GDF_SORT = 0,
GDF_HASH,
N_GDF_METHODS, /* additional methods should go BEFORE N_GDF_METHODS */
} gdf_method;

typedef enum {
GDF_SUM = 0,
GDF_MIN,
GDF_MAX,
GDF_AVG,
GDF_COUNT,
GDF_COUNT_DISTINCT,
N_GDF_AGG_OPS, /* additional aggregation ops should go BEFORE N_GDF_... */
} gdf_agg_op;

/* additonal flags */
typedef struct gdf_context_{
int flag_sorted; /* 0 = No, 1 = yes */
gdf_method flag_method; /* what method is used */
int flag_distinct; /* for COUNT: DISTINCT = 1, else = 0 */
} gdf_context;

struct _OpaqueIpcParser;
typedef struct _OpaqueIpcParser gdf_ipc_parser_type;
Expand Down
Loading

0 comments on commit 1c5936f

Please sign in to comment.