diff --git a/guide/expressions.ml b/guide/expressions.ml index 668499d5..41f33537 100644 --- a/guide/expressions.ml +++ b/guide/expressions.ml @@ -561,11 +561,11 @@ let%expect_test "Casting" = let%expect_test "Aggregation" = let schema = Schema.create - [ "first_name", Utf8 - ; "gender", Utf8 - ; "type", Utf8 - ; "state", Utf8 - ; "party", Utf8 + [ "first_name", Categorical None + ; "gender", Categorical None + ; "type", Categorical None + ; "state", Categorical None + ; "party", Categorical None ; "birthday", Date ] in @@ -582,7 +582,7 @@ let%expect_test "Aggregation" = ┌────────────┬────────────┬────────────┬────────┬───┬───────────┬───────────┬──────────┬───────────┐ │ last_name ┆ first_name ┆ middle_nam ┆ suffix ┆ … ┆ ballotped ┆ washingto ┆ icpsr_id ┆ wikipedia │ │ --- ┆ --- ┆ e ┆ --- ┆ ┆ ia_id ┆ n_post_id ┆ --- ┆ _id │ - │ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ i64 ┆ --- │ + │ str ┆ cat ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ i64 ┆ --- │ │ ┆ ┆ str ┆ ┆ ┆ str ┆ str ┆ ┆ str │ ╞════════════╪════════════╪════════════╪════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡ │ Bassett ┆ Richard ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 507 ┆ Richard │ @@ -627,7 +627,7 @@ let%expect_test "Aggregation" = ┌────────────┬───────┬───────────────────┬───────────┐ │ first_name ┆ count ┆ gender ┆ last_name │ │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ u32 ┆ list[str] ┆ str │ + │ cat ┆ u32 ┆ list[cat] ┆ str │ ╞════════════╪═══════╪═══════════════════╪═══════════╡ │ John ┆ 1256 ┆ ["M", "M", … "M"] ┆ Walker │ │ William ┆ 1022 ┆ ["M", "M", … "M"] ┆ Few │ @@ -655,7 +655,7 @@ let%expect_test "Aggregation" = ┌───────┬──────┬─────┐ │ state ┆ anti ┆ pro │ │ --- ┆ --- ┆ --- │ - │ str ┆ u32 ┆ u32 │ + │ cat ┆ u32 ┆ u32 │ ╞═══════╪══════╪═════╡ │ NJ ┆ 0 ┆ 3 │ │ CT ┆ 0 ┆ 3 │ @@ -684,7 +684,7 @@ let%expect_test "Aggregation" = ┌───────┬─────────────────────┬───────┐ │ state ┆ party ┆ count │ │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ u32 │ + │ cat ┆ cat ┆ u32 │ ╞═══════╪═════════════════════╪═══════╡ │ NJ ┆ 
Pro-Administration ┆ 3 │ │ VA ┆ Anti-Administration ┆ 3 │ @@ -720,7 +720,7 @@ let%expect_test "Aggregation" = ┌───────┬────────────────┬────────────────┬────────┬──────────┐ │ state ┆ avg M birthday ┆ avg F birthday ┆ # male ┆ # female │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ u32 ┆ u32 │ + │ cat ┆ f64 ┆ f64 ┆ u32 ┆ u32 │ ╞═══════╪════════════════╪════════════════╪════════╪══════════╡ │ DE ┆ 182.593407 ┆ null ┆ 97 ┆ 0 │ │ VA ┆ 192.542781 ┆ 66.2 ┆ 430 ┆ 5 │ @@ -749,7 +749,7 @@ let%expect_test "Aggregation" = ┌───────┬──────────────────┬───────────────────────┐ │ state ┆ youngest ┆ oldest │ │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str │ + │ cat ┆ str ┆ str │ ╞═══════╪══════════════════╪═══════════════════════╡ │ NC ┆ Madison Cawthorn ┆ John Ashe │ │ IA ┆ Abby Finkenauer ┆ Bernhart Henn │ @@ -778,7 +778,7 @@ let%expect_test "Aggregation" = ┌───────┬──────────────────┬───────────────────────┬────────────────────┐ │ state ┆ youngest ┆ oldest ┆ alphabetical_first │ │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ + │ cat ┆ str ┆ str ┆ str │ ╞═══════╪══════════════════╪═══════════════════════╪════════════════════╡ │ NC ┆ Madison Cawthorn ┆ John Ashe ┆ Abraham Rencher │ │ IA ┆ Abby Finkenauer ┆ Bernhart Henn ┆ Abby Finkenauer │ @@ -797,11 +797,17 @@ let%expect_test "Aggregation" = ; get_person |> last |> alias ~name:"oldest" ; get_person |> sort |> first |> alias ~name:"alphabetical_first" ; col "gender" - |> sort_by ~by:[ col "first_name" ] + |> sort_by + ~by: + [ (* The guide uses "first_name" to sort by, but I'm guessing + there's a nondeterminism bug causing output to be unstable + if we have multiple sorts or something, so I suspect + [get_person] is what we actually want *) + get_person + ] |> first |> alias ~name:"gender" ] - |> Lazy_frame.sort ~by_column:"state" |> Lazy_frame.limit ~n:5 |> Lazy_frame.collect_exn in @@ -809,17 +815,17 @@ let%expect_test "Aggregation" = [%expect {| shape: (5, 5) -
┌───────┬──────────────────┬────────────────┬────────────────────┬────────┐ - │ state ┆ youngest ┆ oldest ┆ alphabetical_first ┆ gender │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str ┆ str │ - ╞═══════╪══════════════════╪════════════════╪════════════════════╪════════╡ - │ AK ┆ Mark Begich ┆ Thomas Cale ┆ Anthony Dimond ┆ M │ - │ AL ┆ Martha Roby ┆ John McKee ┆ Albert Goodwyn ┆ M │ - │ AR ┆ Tim Griffin ┆ Archibald Yell ┆ Albert Rust ┆ M │ - │ AS ┆ Eni Faleomavaega ┆ Fofó Sunia ┆ Eni Faleomavaega ┆ M │ - │ AZ ┆ Ben Quayle ┆ Coles Bashford ┆ Ann Kirkpatrick ┆ F │ - └───────┴──────────────────┴────────────────┴────────────────────┴────────┘ |}] + ┌───────┬──────────────────┬───────────────────────┬────────────────────┬────────┐ + │ state ┆ youngest ┆ oldest ┆ alphabetical_first ┆ gender │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ cat ┆ str ┆ str ┆ str ┆ cat │ + ╞═══════╪══════════════════╪═══════════════════════╪════════════════════╪════════╡ + │ NC ┆ Madison Cawthorn ┆ John Ashe ┆ Abraham Rencher ┆ M │ + │ IA ┆ Abby Finkenauer ┆ Bernhart Henn ┆ Abby Finkenauer ┆ F │ + │ MI ┆ Peter Meijer ┆ Edward Bradley ┆ Aaron Bliss ┆ M │ + │ CA ┆ Katie Hill ┆ Edward Gilbert ┆ Aaron Sargent ┆ M │ + │ NY ┆ Mondaire Jones ┆ Cornelius Schoonmaker ┆ A. Foster ┆ M │ + └───────┴──────────────────┴───────────────────────┴────────────────────┴────────┘ |}] ;; (* Examples from https://pola-rs.github.io/polars-book/user-guide/expressions/null/ *) diff --git a/lib/data_type.ml b/lib/data_type.ml index a9c2d916..875e2d58 100644 --- a/lib/data_type.ml +++ b/lib/data_type.ml @@ -1,40 +1,38 @@ open! 
Core -module T = struct - type t = - | Boolean - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float32 - | Float64 - | Utf8 - | Binary - | Date - | Datetime of Time_unit.t * Tz.t option - | Duration of Time_unit.t - | Time - | List of t - (* We want this branch to be tested very well, since code dealing with - this recursive case is usually the most non-trivial portion of the - logic. *) - [@quickcheck.weight 10.] - | Null - | Struct of (string * t) list - | Unknown - [@@deriving compare, sexp, quickcheck] -end - -include T -include Sexpable.To_stringable (T) +type t = + | Boolean + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + | Utf8 + | Binary + | Date + | Datetime of Time_unit.t * Tz.t option + | Duration of Time_unit.t + | Time + | List of t + (* We want this branch to be tested very well, since code dealing with + this recursive case is usually the most non-trivial portion of the + logic. *) + [@quickcheck.weight 10.] 
+ | Null + | Categorical of (Rev_mapping.t[@compare.ignore]) option [@quickcheck.do_not_generate] + | Struct of (string * t) list + | Unknown +[@@deriving compare, sexp_of, quickcheck] + +let to_string t = Sexp.to_string ([%sexp_of: t] t) module Typed = struct - type untyped = t [@@deriving compare, sexp, quickcheck] + type untyped = t [@@deriving compare, sexp_of, quickcheck] (* TODO: Consider mapping to smaller OCaml values like Int8, Float32, etc instead of casting up *) @@ -157,7 +155,7 @@ module Typed = struct | Duration time_unit -> Some (T (Duration time_unit)) | Time -> Some (T Time) | List t -> of_untyped t |> Option.map ~f:(fun (T t) -> T (List t)) - | Null | Struct _ | Unknown -> None + | Null | Categorical _ | Struct _ | Unknown -> None ;; let rec sexp_of_packed (T t) = diff --git a/lib/data_type.mli b/lib/data_type.mli index 2607d99c..f63fbfa4 100644 --- a/lib/data_type.mli +++ b/lib/data_type.mli @@ -20,11 +20,12 @@ type t = | Time | List of t | Null + | Categorical of Rev_mapping.t option | Struct of (string * t) list | Unknown -[@@deriving compare, sexp, quickcheck] +[@@deriving compare, sexp_of, quickcheck] -include Stringable.S with type t := t +val to_string : t -> string module Typed : sig type untyped diff --git a/lib/polars.ml b/lib/polars.ml index 82be8a64..c488343e 100644 --- a/lib/polars.ml +++ b/lib/polars.ml @@ -14,5 +14,6 @@ module Naive_time = Naive_time module Schema = Schema module Series = Series module Sql_context = Sql_context +module Rev_mapping = Rev_mapping module Time_unit = Time_unit module Tz = Tz diff --git a/lib/rev_mapping.ml b/lib/rev_mapping.ml new file mode 100644 index 00000000..1f02a534 --- /dev/null +++ b/lib/rev_mapping.ml @@ -0,0 +1,19 @@ +open! 
Core + +type t + +let quickcheck_shrinker = Base_quickcheck.Shrinker.atomic + +let quickcheck_observer = + Base_quickcheck.Observer.of_hash_fold (fun hash_state _t -> hash_state) +;; + +external get_categories : t -> string list = "rust_rev_mapping_get_categories" + +let sexp_of_t t = + get_categories t + |> (* It's difficult to get tests to be deterministic wrt the ordering of + categories, so we force them to be sorted here as a hacky workaround. *) + List.sort ~compare:String.compare + |> [%sexp_of: string list] +;; diff --git a/lib/rev_mapping.mli b/lib/rev_mapping.mli new file mode 100644 index 00000000..695889ea --- /dev/null +++ b/lib/rev_mapping.mli @@ -0,0 +1,7 @@ +open! Core + +type t [@@deriving sexp_of] + +val quickcheck_shrinker : t Quickcheck.Shrinker.t +val quickcheck_observer : t Quickcheck.Observer.t +val get_categories : t -> string list diff --git a/lib/schema.ml b/lib/schema.ml index 025fad5b..b293966d 100644 --- a/lib/schema.ml +++ b/lib/schema.ml @@ -6,4 +6,3 @@ external create : (string * Data_type.t) list -> t = "rust_schema_create" external to_fields : t -> (string * Data_type.t) list = "rust_schema_to_fields" let sexp_of_t t = to_fields t |> [%sexp_of: (string * Data_type.t) list] -let t_of_sexp sexp = [%of_sexp: (string * Data_type.t) list] sexp |> create diff --git a/lib/schema.mli b/lib/schema.mli index d171c84d..7814ccc8 100644 --- a/lib/schema.mli +++ b/lib/schema.mli @@ -1,6 +1,6 @@ open! 
Core -type t [@@deriving sexp] +type t [@@deriving sexp_of] val create : (string * Data_type.t) list -> t val to_fields : t -> (string * Data_type.t) list diff --git a/lib/series.ml b/lib/series.ml index 8109197a..269c7bcd 100644 --- a/lib/series.ml +++ b/lib/series.ml @@ -211,6 +211,15 @@ module T = struct map input_data_type output_data_type t ~f |> Result.ok_exn ;; + external cast + : t + -> to_:Data_type.t + -> strict:bool + -> (t, string) result + = "rust_series_cast" + + let cast ?(strict = true) t ~to_ = cast t ~to_ ~strict |> Utils.string_result_ok_exn + external name : t -> string = "rust_series_name" external rename : t -> name:string -> unit = "rust_series_rename" external dtype : t -> Data_type.t = "rust_series_dtype" diff --git a/lib/series.mli b/lib/series.mli index b45ed9e4..9f4dece5 100644 --- a/lib/series.mli +++ b/lib/series.mli @@ -75,6 +75,7 @@ val map -> f:('a option -> 'b option) -> t +val cast : ?strict:bool -> t -> to_:Data_type.t -> t val name : t -> string val rename : t -> name:string -> unit val dtype : t -> Data_type.t diff --git a/rust/polars-ocaml/Cargo.toml b/rust/polars-ocaml/Cargo.toml index 7b30a103..0981ddc8 100644 --- a/rust/polars-ocaml/Cargo.toml +++ b/rust/polars-ocaml/Cargo.toml @@ -35,6 +35,7 @@ features = [ "dtype-i8", "dtype-u16", "dtype-u8", + "dtype-categorical", "dynamic_groupby", "horizontal_concat", "interpolate", diff --git a/rust/polars-ocaml/src/expr.rs b/rust/polars-ocaml/src/expr.rs index 3cf36286..cfa60e33 100644 --- a/rust/polars-ocaml/src/expr.rs +++ b/rust/polars-ocaml/src/expr.rs @@ -5,7 +5,6 @@ use chrono::{Duration, NaiveDate, NaiveDateTime, NaiveTime}; use ocaml_interop::{ DynBox, OCaml, OCamlBytes, OCamlFloat, OCamlInt, OCamlList, OCamlRef, OCamlRuntime, ToOCaml, }; -use polars::lazy::dsl::GetOutput; use polars::prelude::*; use polars::series::IsSorted; use polars_ocaml_macros::ocaml_interop_export; diff --git a/rust/polars-ocaml/src/misc.rs b/rust/polars-ocaml/src/misc.rs index 055e1642..06b39a72 
100644 --- a/rust/polars-ocaml/src/misc.rs +++ b/rust/polars-ocaml/src/misc.rs @@ -31,6 +31,17 @@ fn rust_schema_to_fields( fields.to_ocaml(cr) } +#[ocaml_interop_export] +fn rust_rev_mapping_get_categories( + cr: &mut &mut OCamlRuntime, + rev_mapping: OCamlRef>>, +) -> OCaml> { + let Abstract(rev_mapping) = rev_mapping.to_rust(cr); + let rev_mapping: Vec<_> = rev_mapping.get_categories().values_iter().collect(); + + rev_mapping.to_ocaml(cr) +} + #[ocaml_interop_export] fn rust_test_panic(cr: &mut &mut OCamlRuntime, error_message: OCamlRef) -> OCaml<()> { let error_message: String = error_message.to_rust(cr); diff --git a/rust/polars-ocaml/src/polars_types.rs b/rust/polars-ocaml/src/polars_types.rs index 6b02661c..69ed1a5d 100644 --- a/rust/polars-ocaml/src/polars_types.rs +++ b/rust/polars-ocaml/src/polars_types.rs @@ -5,8 +5,8 @@ use ocaml_interop::{ polymorphic_variant_tag_hash, DynBox, FromOCaml, OCaml, OCamlInt, OCamlList, OCamlRuntime, ToOCaml, }; +use polars::prelude::*; use polars::series::IsSorted; -use polars::{lazy::dsl::WindowMapping, prelude::*}; use smartstring::{LazyCompact, SmartString}; #[derive(Debug, Clone)] @@ -71,6 +71,10 @@ unsafe impl FromOCaml for PolarsDataType { DataType::List(Box::new(datatype)) }, DataType::Null, + DataType::Categorical(local_rev_mapping_opt: Option>>) => { + let local_rev_mapping_opt: Option>> = local_rev_mapping_opt; + DataType::Categorical(local_rev_mapping_opt.map(Abstract::get)) + }, DataType::Struct(fields: OCamlList<(String, DataType)>) => { let fields_: Vec<(String, PolarsDataType)> = fields; let fields: Vec = @@ -134,12 +138,16 @@ unsafe impl ToOCaml for PolarsDataType { ocaml_alloc_tagged_block!(cr, 2, datatype: DataType) } DataType::Null => ocaml_value(cr, 15), + DataType::Categorical(local_rev_mapping_opt) => { + let local_rev_mapping_opt = local_rev_mapping_opt.clone().map(Abstract); + ocaml_alloc_tagged_block!(cr, 3, local_rev_mapping_opt: Option>>) + } DataType::Struct(fields) => { let fields: Vec<(String, 
PolarsDataType)> = fields .iter() .map(|field| (field.name.to_string(), PolarsDataType(field.dtype.clone()))) .collect(); - ocaml_alloc_tagged_block!(cr, 3, fields: OCamlList<(String, DataType)>) + ocaml_alloc_tagged_block!(cr, 4, fields: OCamlList<(String, DataType)>) } DataType::Unknown => ocaml_value(cr, 16), } diff --git a/rust/polars-ocaml/src/series.rs b/rust/polars-ocaml/src/series.rs index 625f8907..3c3edfcf 100644 --- a/rust/polars-ocaml/src/series.rs +++ b/rust/polars-ocaml/src/series.rs @@ -924,6 +924,11 @@ fn rust_series_get( series_get(cr, &data_type, &series, index)?.to_ocaml(cr) } +enum SeriesMapError { + SeriesGetError(String), + FunctionCallError(BoxRoot), +} + #[ocaml_interop_export(raise_on_err)] fn rust_series_map( cr: &mut &mut OCamlRuntime, @@ -1029,9 +1034,25 @@ fn rust_series_map_idiomatic<'a>( Ok(ocaml_interop::alloc_ok(cr, &ocaml_series)) } -enum SeriesMapError { - SeriesGetError(String), - FunctionCallError(BoxRoot), +#[ocaml_interop_export] +fn rust_series_cast( + cr: &mut &mut OCamlRuntime, + series: OCamlRef>, + dtype: OCamlRef, + is_strict: OCamlRef, +) -> OCaml, String>> { + let PolarsDataType(dtype) = dtype.to_rust(cr); + let is_strict: bool = is_strict.to_rust(cr); + + dyn_box_result(cr, series, |series| { + let series = series.borrow(); + if is_strict { + series.strict_cast(&dtype) + } else { + series.cast(&dtype) + } + .map(|s| Rc::new(RefCell::new(s))) + }) } #[ocaml_interop_export] diff --git a/test/categorical_test.ml b/test/categorical_test.ml new file mode 100644 index 00000000..2a21b54b --- /dev/null +++ b/test/categorical_test.ml @@ -0,0 +1,102 @@ +open Core +open Polars + +let%expect_test "Rev_mapping.get_categories returns categories in order they are \ + encountered" + = + let s = + Series.stringo "" [ Some "foo"; None; Some "bar"; Some "ham" ] + |> Series.cast ~to_:(Categorical None) + in + let rev_mapping = + match Series.dtype s with + | Categorical (Some rev_mapping) -> rev_mapping + | _ -> failwith "unexpected" + 
in + Rev_mapping.get_categories rev_mapping |> [%sexp_of: string list] |> print_s; + [%expect {| (foo bar ham) |}] +;; + +let%expect_test "Test csv parsing with schema including categorical types" = + let schema = + Schema.create + [ "gender", Categorical None + ; "type", Categorical None + ; "state", Categorical None + ; "party", Categorical None + ; "birthday", Date + ] + in + let dataset = + Data_frame.read_csv_exn + ~schema + ~try_parse_dates:true + "../guide/data/legislators-historical.csv" + in + Data_frame.print dataset; + [%expect + {| + shape: (12_136, 36) + ┌────────────┬────────────┬────────────┬────────┬───┬───────────┬───────────┬──────────┬───────────┐ + │ last_name ┆ first_name ┆ middle_nam ┆ suffix ┆ … ┆ ballotped ┆ washingto ┆ icpsr_id ┆ wikipedia │ + │ --- ┆ --- ┆ e ┆ --- ┆ ┆ ia_id ┆ n_post_id ┆ --- ┆ _id │ + │ str ┆ str ┆ --- ┆ str ┆ ┆ --- ┆ --- ┆ i64 ┆ --- │ + │ ┆ ┆ str ┆ ┆ ┆ str ┆ str ┆ ┆ str │ + ╞════════════╪════════════╪════════════╪════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡ + │ Bassett ┆ Richard ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 507 ┆ Richard │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Bassett │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (Delaware │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ politi… │ + │ Bland ┆ Theodorick ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 786 ┆ Theodoric │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ k Bland │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ (congress │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ man) │ + │ Burke ┆ Aedanus ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1260 ┆ Aedanus │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Burke │ + │ Carroll ┆ Daniel ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 1538 ┆ Daniel │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Carroll │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ Flores ┆ Mayra ┆ null ┆ null ┆ … ┆ Mayra ┆ null ┆ null ┆ Mayra │ + │ ┆ ┆ ┆ ┆ ┆ Flores ┆ ┆ ┆ Flores │ + │ Sempolinsk ┆ Joseph ┆ null ┆ null ┆ … ┆ Joe Sempo ┆ null ┆ null ┆ Joe Sempo │ + │ i ┆ ┆ ┆ ┆ ┆ linski ┆ ┆ ┆ linski │ + │ Inhofe ┆ James ┆ M. 
┆ null ┆ … ┆ Jim ┆ null ┆ 15424 ┆ Jim │ + │ ┆ ┆ ┆ ┆ ┆ Inhofe ┆ ┆ ┆ Inhofe │ + │ Sasse ┆ Benjamin ┆ Eric ┆ null ┆ … ┆ Ben Sasse ┆ null ┆ 41503 ┆ Ben Sasse │ + └────────────┴────────────┴────────────┴────────┴───┴───────────┴───────────┴──────────┴───────────┘ |}]; + Data_frame.schema dataset |> [%sexp_of: Schema.t] |> print_s; + [%expect + {| + ((last_name Utf8) (first_name Utf8) (middle_name Utf8) (suffix Utf8) + (nickname Utf8) (full_name Utf8) (birthday Date) + (gender (Categorical ((F M)))) (type (Categorical ((rep sen)))) + (state + (Categorical + ((AK AL AR AS AZ CA CO CT DC DE DK FL GA GU HI IA ID IL IN KS KY LA MA MD + ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OL OR PA PI PR RI SC SD + TN TX UT VA VI VT WA WI WV WY)))) + (district Int64) (senate_class Int64) + (party + (Categorical + ((Adams "Adams Democrat" American "American Labor" "Anti Jackson" + "Anti Jacksonian" "Anti Masonic" Anti-Administration Anti-Jacksonian + "Anti-Lecompton Democrat" Coalitionist Conservative + "Conservative Republican" "Constitutional Unionist" + "Crawford Republican" Democrat Democrat-Liberal "Democratic Republican" + Farmer-Labor Federalist "Free Silver" "Free Soil" "Ind. Democrat" + "Ind. Republican" "Ind. Republican-Democrat" "Ind. 
Whig" Independent + "Independent Democrat" Jackson "Jackson Republican" Jacksonian + "Law and Order" "Liberal Republican" Libertarian Liberty + "National Greenbacker" "New Progressive" Nonpartisan Nullifier + "Popular Democrat" Populist Pro-Administration Progressive + "Progressive Republican" Prohibitionist Readjuster "Readjuster Democrat" + Republican Republican-Conservative "Silver Republican" Socialist + "States Rights" "Unconditional Unionist" Union "Union Democrat" + "Union Labor" Unionist Unknown Whig)))) + (url Utf8) (address Utf8) (phone Utf8) (contact_form Utf8) (rss_url Utf8) + (twitter Utf8) (twitter_id Utf8) (facebook Utf8) (youtube Utf8) + (youtube_id Utf8) (mastodon Utf8) (bioguide_id Utf8) (thomas_id Utf8) + (opensecrets_id Utf8) (lis_id Utf8) (fec_ids Utf8) (cspan_id Utf8) + (govtrack_id Int64) (votesmart_id Utf8) (ballotpedia_id Utf8) + (washington_post_id Utf8) (icpsr_id Int64) (wikipedia_id Utf8)) |}] +;; diff --git a/test/schema_test.ml b/test/schema_test.ml index d18b9710..feaf89a5 100644 --- a/test/schema_test.ml +++ b/test/schema_test.ml @@ -17,6 +17,7 @@ let%expect_test "check serializations" = ; Float64 ; Utf8 ; Binary + ; Categorical None ; Date ] in @@ -41,7 +42,8 @@ let%expect_test "check serializations" = {| ((Boolean Boolean) (UInt8 UInt8) (UInt16 UInt16) (UInt32 UInt32) (UInt64 UInt64) (Int8 Int8) (Int16 Int16) (Int32 Int32) (Int64 Int64) - (Float32 Float32) (Float64 Float64) (Utf8 Utf8) (Binary Binary) (Date Date) + (Float32 Float32) (Float64 Float64) (Utf8 Utf8) (Binary Binary) + ("(Categorical())" (Categorical ())) (Date Date) ("(Datetime Nanoseconds())" (Datetime Nanoseconds ())) ("(Datetime Microseconds())" (Datetime Microseconds ())) ("(Datetime Milliseconds())" (Datetime Milliseconds ())) @@ -49,11 +51,11 @@ let%expect_test "check serializations" = ("(Duration Microseconds)" (Duration Microseconds)) ("(Duration Milliseconds)" (Duration Milliseconds)) (Time Time) ("(List Boolean)" (List Boolean)) (Null Null) - 
("(Struct((Boolean Boolean)(UInt8 UInt8)(UInt16 UInt16)(UInt32 UInt32)(UInt64 UInt64)(Int8 Int8)(Int16 Int16)(Int32 Int32)(Int64 Int64)(Float32 Float32)(Float64 Float64)(Utf8 Utf8)(Binary Binary)(Date Date)))" + ("(Struct((Boolean Boolean)(UInt8 UInt8)(UInt16 UInt16)(UInt32 UInt32)(UInt64 UInt64)(Int8 Int8)(Int16 Int16)(Int32 Int32)(Int64 Int64)(Float32 Float32)(Float64 Float64)(Utf8 Utf8)(Binary Binary)(\"(Categorical())\"(Categorical()))(Date Date)))" (Struct ((Boolean Boolean) (UInt8 UInt8) (UInt16 UInt16) (UInt32 UInt32) (UInt64 UInt64) (Int8 Int8) (Int16 Int16) (Int32 Int32) (Int64 Int64) (Float32 Float32) (Float64 Float64) (Utf8 Utf8) (Binary Binary) - (Date Date)))) + ("(Categorical())" (Categorical ())) (Date Date)))) (Unknown Unknown)) |}] ;;