From 6a3b790a551ebcd4a3df2d448acdb5d65d7e1ffb Mon Sep 17 00:00:00 2001 From: Christian Melchior Date: Mon, 27 May 2024 12:42:01 +0200 Subject: [PATCH 1/3] KTNB-693 Send the full dataframe schema as metadata, so it can be used by AI actions. --- .../kotlinx/dataframe/impl/io/writeJson.kt | 53 +++++++++++++++---- .../dataframe/jupyter/JupyterHtmlRenderer.kt | 16 ++++-- .../dataframe/jupyter/RenderingTests.kt | 48 +++++++++++++++++ 3 files changed, 103 insertions(+), 14 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt index 4c8290a84d..afbfecd34d 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -3,15 +3,9 @@ package org.jetbrains.kotlinx.dataframe.impl.io import com.beust.klaxon.JsonArray import com.beust.klaxon.JsonObject import com.beust.klaxon.KlaxonJson -import org.jetbrains.kotlinx.dataframe.AnyCol -import org.jetbrains.kotlinx.dataframe.AnyFrame -import org.jetbrains.kotlinx.dataframe.ColumnsContainer -import org.jetbrains.kotlinx.dataframe.DataColumn -import org.jetbrains.kotlinx.dataframe.api.indices -import org.jetbrains.kotlinx.dataframe.api.isList +import org.jetbrains.kotlinx.dataframe.* +import org.jetbrains.kotlinx.dataframe.api.* import org.jetbrains.kotlinx.dataframe.api.name -import org.jetbrains.kotlinx.dataframe.api.rows -import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn @@ -22,13 +16,15 @@ import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL import 
org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.SCHEMA import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.arrayColumnName import org.jetbrains.kotlinx.dataframe.io.valueColumnName import org.jetbrains.kotlinx.dataframe.ncol import org.jetbrains.kotlinx.dataframe.nrow -import org.jetbrains.kotlinx.dataframe.typeClass +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.awt.image.BufferedImage import java.io.IOException @@ -53,9 +49,19 @@ internal object SerializationKeys { const val VERSION = "\$version" const val COLUMNS = "columns" const val KOTLIN_DATAFRAME = "kotlin_dataframe" + const val SCHEMA = "schema" } -internal const val SERIALIZATION_VERSION = "2.0.0" +/** + * Changes: + * 1.0.0: + * - + * 2.0.0: + * - + * 2.1.0: + * - Added "schema" entry to metadata + */ +internal const val SERIALIZATION_VERSION = "2.1.0" internal fun KlaxonJson.encodeRowWithMetadata( frame: ColumnsContainer<*>, @@ -256,6 +262,7 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( return obj( VERSION to SERIALIZATION_VERSION, METADATA to obj( + SCHEMA to frame.schema().toJson(), COLUMNS to frame.columnNames(), NROW to frame.rowsCount(), NCOL to frame.columnsCount() @@ -267,3 +274,29 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( ), ) } + +/** + * Turn a [DataFrameSchema] into a datastructure that can be parsed + * to a JSON serializer + * + * Each column is represented the following way: + * - value columns: `{name: "", kind: "ValueColumn", type: "" }` + * - colum groups: `{name: "", kind: "ColumnGroup", group: [,...] }` + * - data frames: `{name: "", kind: "FrameColumn", dataframe: [,...] 
}` + */ +internal fun DataFrameSchema.toJson(): MutableList> { + val list: MutableList> = mutableListOf() + columns.forEach { (name: String, columnSchema: ColumnSchema) -> + val schemaData = mutableMapOf() + schemaData["name"] = name + schemaData["kind"] = columnSchema.kind.toString() + when (columnSchema) { + is ColumnSchema.Value -> schemaData["type"] = columnSchema.type.toString() + is ColumnSchema.Group -> schemaData["group"] = columnSchema.schema.toJson() + is ColumnSchema.Frame -> schemaData["dataframe"] = columnSchema.schema.toJson() + else -> error("Unsupported column type: $columnSchema") + } + list.add(schemaData) + } + return list +} diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt index a5a4dc249b..a7534557a5 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt @@ -1,8 +1,15 @@ package org.jetbrains.kotlinx.dataframe.jupyter import com.beust.klaxon.json +import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.SCHEMA import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame +import org.jetbrains.kotlinx.dataframe.impl.io.toJson import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration @@ -69,10 +76,11 @@ internal inline fun JupyterHtmlRenderer.render( 
!ideBuildNumber.supportsDynamicNestedTables() -> { json { obj( - "nrow" to df.size.nrow, - "ncol" to df.size.ncol, - "columns" to df.columnNames(), - "kotlin_dataframe" to encodeFrame(df.take(limit)), + NROW to df.size.nrow, + NCOL to df.size.ncol, + SCHEMA to df.schema().toJson(), + COLUMNS to df.columnNames(), + KOTLIN_DATAFRAME to encodeFrame(df.take(limit)), ) }.toJsonString() } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt index a68f1ede3a..49f90aa46a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt @@ -4,6 +4,7 @@ import com.beust.klaxon.JsonArray import com.beust.klaxon.JsonObject import com.beust.klaxon.Parser import io.kotest.assertions.throwables.shouldNotThrow +import io.kotest.matchers.collections.shouldContain import io.kotest.matchers.comparables.shouldBeGreaterThan import io.kotest.matchers.comparables.shouldBeLessThan import io.kotest.matchers.shouldBe @@ -212,6 +213,53 @@ class RenderingTests : JupyterReplTestCase() { } } + @Test + fun `json metadata contains schema metadata`() { + val json = executeScriptAndParseDataframeResult( + """ + val col1 by columnOf("a", "b", "c") + val col2 by columnOf(1, 2, 3) + val col3 by columnOf("Foo", "Bar", null) + val df2 = dataFrameOf(Pair("header", listOf("A", "B", "C"))) + val col4 by columnOf(df2, df2, df2) + var df = dataFrameOf(col1, col2, col3, col4) + df.group(col1, col2).into("group") + """.trimIndent() + ) + json.keys shouldContain "metadata" + val metadata = json["metadata"] as JsonObject + metadata.keys shouldContain "schema" + val schema = (metadata["schema"] as JsonArray<*>).toJsonString(prettyPrint = true) + val expectedSchema = """ + [{ + "name": "group", + "kind": "ColumnGroup", + "group": [{ + "name": "col1", + "kind": "ValueColumn", + "type": 
"kotlin.String" + }, { + "name": "col2", + "kind": "ValueColumn", + "type": "kotlin.Int" + }] + }, { + "name": "col3", + "kind": "ValueColumn", + "type": "kotlin.String?" + }, { + "name": "col4", + "kind": "FrameColumn", + "dataframe": [{ + "name": "header", + "kind": "ValueColumn", + "type": "kotlin.String" + }] + }] + """.trimIndent() + schema shouldBe expectedSchema + } + @Test fun `test kotlin dataframe conversion groupby`() { val json = executeScriptAndParseDataframeResult( From 76969bda3631c745f1157ed285974319979a0d07 Mon Sep 17 00:00:00 2001 From: Christian Melchior Date: Tue, 28 May 2024 17:34:47 +0200 Subject: [PATCH 2/3] Move schema metadata into kotlin_dataframe elements --- .../kotlinx/dataframe/impl/io/writeJson.kt | 73 +++++----- .../dataframe/jupyter/JupyterHtmlRenderer.kt | 5 +- .../dataframe/jupyter/RenderingTests.kt | 127 ++++++++++++++---- 3 files changed, 133 insertions(+), 72 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt index afbfecd34d..3e6d8c9b85 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -16,13 +16,11 @@ import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW -import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.SCHEMA +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.arrayColumnName import org.jetbrains.kotlinx.dataframe.io.valueColumnName 
-import org.jetbrains.kotlinx.dataframe.ncol -import org.jetbrains.kotlinx.dataframe.nrow import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.awt.image.BufferedImage @@ -49,7 +47,7 @@ internal object SerializationKeys { const val VERSION = "\$version" const val COLUMNS = "columns" const val KOTLIN_DATAFRAME = "kotlin_dataframe" - const val SCHEMA = "schema" + const val TYPES = "types" } /** @@ -71,24 +69,47 @@ internal fun KlaxonJson.encodeRowWithMetadata( ): JsonObject? { val values = frame.columns().map { col -> when (col) { - is ColumnGroup<*> -> obj( - DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions), - METADATA to obj(KIND to ColumnKind.Group.toString()) - ) - + is ColumnGroup<*> -> { + val schema = col.schema() + obj( + DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions), + METADATA to obj( + KIND to ColumnKind.Group.toString(), + COLUMNS to schema.columns.keys, + TYPES to schema.columns.values.map { colSchema: ColumnSchema -> + when(colSchema.kind) { + ColumnKind.Value -> colSchema.type.toString() + ColumnKind.Group -> null + ColumnKind.Frame -> null + } + } + ), + ) + } is FrameColumn<*> -> { - val data = if (rowLimit == null) encodeFrameWithMetadata(col[index], null, imageEncodingOptions) - else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions) + val data = if (rowLimit == null) { + encodeFrameWithMetadata(col[index], null, imageEncodingOptions) + } else { + encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions) + } + val schema = col.schema.value obj( DATA to data, METADATA to obj( KIND to ColumnKind.Frame.toString(), + COLUMNS to schema.columns.keys, + TYPES to schema.columns.values.map { colSchema: ColumnSchema -> + when(colSchema.kind) { + ColumnKind.Value -> colSchema.type.toString() + ColumnKind.Group -> null + ColumnKind.Frame -> null + } + }, NCOL to 
col[index].ncol, NROW to col[index].nrow ) ) } - else -> encodeValue(col, index, imageEncodingOptions) }.let { col.name to it } } @@ -262,8 +283,8 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( return obj( VERSION to SERIALIZATION_VERSION, METADATA to obj( - SCHEMA to frame.schema().toJson(), COLUMNS to frame.columnNames(), + TYPES to frame.columnTypes().map { it.toString() }, NROW to frame.rowsCount(), NCOL to frame.columnsCount() ), @@ -274,29 +295,3 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( ), ) } - -/** - * Turn a [DataFrameSchema] into a datastructure that can be parsed - * to a JSON serializer - * - * Each column is represented the following way: - * - value columns: `{name: "", kind: "ValueColumn", type: "" }` - * - colum groups: `{name: "", kind: "ColumnGroup", group: [,...] }` - * - data frames: `{name: "", kind: "FrameColumn", dataframe: [,...] }` - */ -internal fun DataFrameSchema.toJson(): MutableList> { - val list: MutableList> = mutableListOf() - columns.forEach { (name: String, columnSchema: ColumnSchema) -> - val schemaData = mutableMapOf() - schemaData["name"] = name - schemaData["kind"] = columnSchema.kind.toString() - when (columnSchema) { - is ColumnSchema.Value -> schemaData["type"] = columnSchema.type.toString() - is ColumnSchema.Group -> schemaData["group"] = columnSchema.schema.toJson() - is ColumnSchema.Frame -> schemaData["dataframe"] = columnSchema.schema.toJson() - else -> error("Unsupported column type: $columnSchema") - } - list.add(schemaData) - } - return list -} diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt index a7534557a5..469dd634fa 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt @@ -1,15 +1,12 @@ package 
org.jetbrains.kotlinx.dataframe.jupyter import com.beust.klaxon.json -import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW -import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.SCHEMA import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame -import org.jetbrains.kotlinx.dataframe.impl.io.toJson import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration @@ -72,13 +69,13 @@ internal inline fun JupyterHtmlRenderer.render( if (notebook.kernelVersion >= KotlinKernelVersion.from(MIN_KERNEL_VERSION_FOR_NEW_TABLES_UI)!!) { val ideBuildNumber = KotlinNotebookPluginUtils.getKotlinNotebookIDEBuildNumber() + // TODO Do we need to handle the improved meta data here as well? 
val jsonEncodedDf = when { !ideBuildNumber.supportsDynamicNestedTables() -> { json { obj( NROW to df.size.nrow, NCOL to df.size.ncol, - SCHEMA to df.schema().toJson(), COLUMNS to df.columnNames(), KOTLIN_DATAFRAME to encodeFrame(df.take(limit)), ) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt index 49f90aa46a..6bbf78ae3b 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt @@ -226,38 +226,107 @@ class RenderingTests : JupyterReplTestCase() { df.group(col1, col2).into("group") """.trimIndent() ) - json.keys shouldContain "metadata" - val metadata = json["metadata"] as JsonObject - metadata.keys shouldContain "schema" - val schema = (metadata["schema"] as JsonArray<*>).toJsonString(prettyPrint = true) - val expectedSchema = """ - [{ - "name": "group", - "kind": "ColumnGroup", - "group": [{ - "name": "col1", - "kind": "ValueColumn", - "type": "kotlin.String" + val jsonOutput = json.toJsonString(prettyPrint = true) + val expectedOutput = """ + { + "${'$'}version": "2.1.0", + "metadata": { + "columns": ["group", "col3", "col4"], + "types": ["org.jetbrains.kotlinx.dataframe.DataRow<*>", "kotlin.String?", "org.jetbrains.kotlinx.dataframe.DataFrame<*>"], + "nrow": 3, + "ncol": 3 + }, + "kotlin_dataframe": [{ + "group": { + "data": { + "col1": "a", + "col2": 1 + }, + "metadata": { + "kind": "ColumnGroup", + "columns": ["col1", "col2"], + "types": ["kotlin.String", "kotlin.Int"] + } + }, + "col3": "Foo", + "col4": { + "data": [{ + "header": "A" + }, { + "header": "B" + }, { + "header": "C" + }], + "metadata": { + "kind": "FrameColumn", + "columns": ["header"], + "types": ["kotlin.String"], + "ncol": 1, + "nrow": 3 + } + } }, { - "name": "col2", - "kind": "ValueColumn", - "type": "kotlin.Int" - }] - }, { - "name": "col3", - 
"kind": "ValueColumn", - "type": "kotlin.String?" - }, { - "name": "col4", - "kind": "FrameColumn", - "dataframe": [{ - "name": "header", - "kind": "ValueColumn", - "type": "kotlin.String" + "group": { + "data": { + "col1": "b", + "col2": 2 + }, + "metadata": { + "kind": "ColumnGroup", + "columns": ["col1", "col2"], + "types": ["kotlin.String", "kotlin.Int"] + } + }, + "col3": "Bar", + "col4": { + "data": [{ + "header": "A" + }, { + "header": "B" + }, { + "header": "C" + }], + "metadata": { + "kind": "FrameColumn", + "columns": ["header"], + "types": ["kotlin.String"], + "ncol": 1, + "nrow": 3 + } + } + }, { + "group": { + "data": { + "col1": "c", + "col2": 3 + }, + "metadata": { + "kind": "ColumnGroup", + "columns": ["col1", "col2"], + "types": ["kotlin.String", "kotlin.Int"] + } + }, + "col3": null, + "col4": { + "data": [{ + "header": "A" + }, { + "header": "B" + }, { + "header": "C" + }], + "metadata": { + "kind": "FrameColumn", + "columns": ["header"], + "types": ["kotlin.String"], + "ncol": 1, + "nrow": 3 + } + } }] - }] + } """.trimIndent() - schema shouldBe expectedSchema + jsonOutput shouldBe expectedOutput } @Test From 2679f48d05f0d7d95068fdf743610193f101df91 Mon Sep 17 00:00:00 2001 From: Christian Melchior Date: Wed, 29 May 2024 15:54:03 +0200 Subject: [PATCH 3/3] Type information is now an object kind and type. It is added on the top-level frame as well as on each row. 
Updated serialization_format.md --- .../kotlinx/dataframe/impl/io/writeJson.kt | 56 +++++++++++++---- .../dataframe/jupyter/JupyterHtmlRenderer.kt | 12 ++-- .../kotlinx/dataframe/impl/io/writeJson.kt | 60 +++++++++++-------- .../dataframe/jupyter/RenderingTests.kt | 48 ++++++++++++--- docs/serialization_format.md | 38 ++++++++++-- 5 files changed, 163 insertions(+), 51 deletions(-) diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt index 4c8290a84d..2bff506bcd 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -9,8 +9,8 @@ import org.jetbrains.kotlinx.dataframe.ColumnsContainer import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.api.indices import org.jetbrains.kotlinx.dataframe.api.isList -import org.jetbrains.kotlinx.dataframe.api.name import org.jetbrains.kotlinx.dataframe.api.rows +import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup import org.jetbrains.kotlinx.dataframe.columns.ColumnKind @@ -22,12 +22,16 @@ import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPE +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import 
org.jetbrains.kotlinx.dataframe.io.arrayColumnName import org.jetbrains.kotlinx.dataframe.io.valueColumnName +import org.jetbrains.kotlinx.dataframe.name import org.jetbrains.kotlinx.dataframe.ncol import org.jetbrains.kotlinx.dataframe.nrow +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import org.jetbrains.kotlinx.dataframe.typeClass import java.awt.image.BufferedImage import java.io.IOException @@ -53,9 +57,13 @@ internal object SerializationKeys { const val VERSION = "\$version" const val COLUMNS = "columns" const val KOTLIN_DATAFRAME = "kotlin_dataframe" + const val TYPE = "type" + const val TYPES = "types" } -internal const val SERIALIZATION_VERSION = "2.0.0" +// See docs/serialization_format.md for a description of +// serialization versions and format. +internal const val SERIALIZATION_VERSION = "2.1.0" internal fun KlaxonJson.encodeRowWithMetadata( frame: ColumnsContainer<*>, @@ -65,24 +73,39 @@ internal fun KlaxonJson.encodeRowWithMetadata( ): JsonObject? { val values = frame.columns().map { col -> when (col) { - is ColumnGroup<*> -> obj( - DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions), - METADATA to obj(KIND to ColumnKind.Group.toString()) - ) - + is ColumnGroup<*> -> { + val schema = col.schema() + obj( + DATA to encodeRowWithMetadata(col, index, rowLimit, imageEncodingOptions), + METADATA to obj( + KIND to ColumnKind.Group.toString(), + COLUMNS to schema.columns.keys, + TYPES to schema.columns.values.map { columnSchema -> + createJsonTypeDescriptor(columnSchema) + } + ), + ) + } is FrameColumn<*> -> { - val data = if (rowLimit == null) encodeFrameWithMetadata(col[index], null, imageEncodingOptions) - else encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions) + val data = if (rowLimit == null) { + encodeFrameWithMetadata(col[index], null, imageEncodingOptions) + } else { + encodeFrameWithMetadata(col[index].take(rowLimit), rowLimit, imageEncodingOptions) + } + val schema = 
col.schema.value obj( DATA to data, METADATA to obj( KIND to ColumnKind.Frame.toString(), + COLUMNS to schema.columns.keys, + TYPES to schema.columns.values.map { columnSchema -> + createJsonTypeDescriptor(columnSchema) + }, NCOL to col[index].ncol, NROW to col[index].nrow ) ) } - else -> encodeValue(col, index, imageEncodingOptions) }.let { col.name to it } } @@ -148,6 +171,16 @@ private fun encodeBufferedImageAsBase64( } } +private fun createJsonTypeDescriptor(columnSchema: ColumnSchema): JsonObject { + return JsonObject( + mutableMapOf(KIND to columnSchema.kind.toString()).also { + if (columnSchema.kind == ColumnKind.Value) { + it.put(TYPE, columnSchema.type.toString()) + } + } + ) +} + internal fun KlaxonJson.encodeFrameWithMetadata( frame: AnyFrame, rowLimit: Int? = null, @@ -257,6 +290,9 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( VERSION to SERIALIZATION_VERSION, METADATA to obj( COLUMNS to frame.columnNames(), + TYPES to frame.schema().columns.values.map { colSchema -> + createJsonTypeDescriptor(colSchema) + }, NROW to frame.rowsCount(), NCOL to frame.columnsCount() ), diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt index a5a4dc249b..b76383495a 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/JupyterHtmlRenderer.kt @@ -2,6 +2,10 @@ package org.jetbrains.kotlinx.dataframe.jupyter import com.beust.klaxon.json import org.jetbrains.kotlinx.dataframe.api.take +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL +import 
org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData @@ -69,10 +73,10 @@ internal inline fun JupyterHtmlRenderer.render( !ideBuildNumber.supportsDynamicNestedTables() -> { json { obj( - "nrow" to df.size.nrow, - "ncol" to df.size.ncol, - "columns" to df.columnNames(), - "kotlin_dataframe" to encodeFrame(df.take(limit)), + NROW to df.size.nrow, + NCOL to df.size.ncol, + COLUMNS to df.columnNames(), + KOTLIN_DATAFRAME to encodeFrame(df.take(limit)), ) }.toJsonString() } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt index 3e6d8c9b85..2bff506bcd 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeJson.kt @@ -3,9 +3,15 @@ package org.jetbrains.kotlinx.dataframe.impl.io import com.beust.klaxon.JsonArray import com.beust.klaxon.JsonObject import com.beust.klaxon.KlaxonJson -import org.jetbrains.kotlinx.dataframe.* -import org.jetbrains.kotlinx.dataframe.api.* -import org.jetbrains.kotlinx.dataframe.api.name +import org.jetbrains.kotlinx.dataframe.AnyCol +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.ColumnsContainer +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.indices +import org.jetbrains.kotlinx.dataframe.api.isList +import org.jetbrains.kotlinx.dataframe.api.rows +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.take import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.FrameColumn @@ -16,13 +22,17 @@ import 
org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAM import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW +import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPE import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions import org.jetbrains.kotlinx.dataframe.io.arrayColumnName import org.jetbrains.kotlinx.dataframe.io.valueColumnName +import org.jetbrains.kotlinx.dataframe.name +import org.jetbrains.kotlinx.dataframe.ncol +import org.jetbrains.kotlinx.dataframe.nrow import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema -import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema +import org.jetbrains.kotlinx.dataframe.typeClass import java.awt.image.BufferedImage import java.io.IOException @@ -47,18 +57,12 @@ internal object SerializationKeys { const val VERSION = "\$version" const val COLUMNS = "columns" const val KOTLIN_DATAFRAME = "kotlin_dataframe" + const val TYPE = "type" const val TYPES = "types" } -/** - * Changes: - * 1.0.0: - * - - * 2.0.0: - * - - * 2.1.0: - * - Added "schema" entry to metadata - */ +// See docs/serialization_format.md for a description of +// serialization versions and format. 
internal const val SERIALIZATION_VERSION = "2.1.0" internal fun KlaxonJson.encodeRowWithMetadata( @@ -76,12 +80,8 @@ internal fun KlaxonJson.encodeRowWithMetadata( METADATA to obj( KIND to ColumnKind.Group.toString(), COLUMNS to schema.columns.keys, - TYPES to schema.columns.values.map { colSchema: ColumnSchema -> - when(colSchema.kind) { - ColumnKind.Value -> colSchema.type.toString() - ColumnKind.Group -> null - ColumnKind.Frame -> null - } + TYPES to schema.columns.values.map { columnSchema -> + createJsonTypeDescriptor(columnSchema) } ), ) @@ -98,12 +98,8 @@ internal fun KlaxonJson.encodeRowWithMetadata( METADATA to obj( KIND to ColumnKind.Frame.toString(), COLUMNS to schema.columns.keys, - TYPES to schema.columns.values.map { colSchema: ColumnSchema -> - when(colSchema.kind) { - ColumnKind.Value -> colSchema.type.toString() - ColumnKind.Group -> null - ColumnKind.Frame -> null - } + TYPES to schema.columns.values.map { columnSchema -> + createJsonTypeDescriptor(columnSchema) }, NCOL to col[index].ncol, NROW to col[index].nrow @@ -175,6 +171,16 @@ private fun encodeBufferedImageAsBase64( } } +private fun createJsonTypeDescriptor(columnSchema: ColumnSchema): JsonObject { + return JsonObject( + mutableMapOf(KIND to columnSchema.kind.toString()).also { + if (columnSchema.kind == ColumnKind.Value) { + it.put(TYPE, columnSchema.type.toString()) + } + } + ) +} + internal fun KlaxonJson.encodeFrameWithMetadata( frame: AnyFrame, rowLimit: Int? 
= null, @@ -284,7 +290,9 @@ internal fun KlaxonJson.encodeDataFrameWithMetadata( VERSION to SERIALIZATION_VERSION, METADATA to obj( COLUMNS to frame.columnNames(), - TYPES to frame.columnTypes().map { it.toString() }, + TYPES to frame.schema().columns.values.map { colSchema -> + createJsonTypeDescriptor(colSchema) + }, NROW to frame.rowsCount(), NCOL to frame.columnsCount() ), diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt index 6bbf78ae3b..28e4575ed6 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/RenderingTests.kt @@ -232,7 +232,14 @@ class RenderingTests : JupyterReplTestCase() { "${'$'}version": "2.1.0", "metadata": { "columns": ["group", "col3", "col4"], - "types": ["org.jetbrains.kotlinx.dataframe.DataRow<*>", "kotlin.String?", "org.jetbrains.kotlinx.dataframe.DataFrame<*>"], + "types": [{ + "kind": "ColumnGroup" + }, { + "kind": "ValueColumn", + "type": "kotlin.String?" 
+ }, { + "kind": "FrameColumn" + }], "nrow": 3, "ncol": 3 }, @@ -245,7 +252,13 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "ColumnGroup", "columns": ["col1", "col2"], - "types": ["kotlin.String", "kotlin.Int"] + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }, { + "kind": "ValueColumn", + "type": "kotlin.Int" + }] } }, "col3": "Foo", @@ -260,7 +273,10 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "FrameColumn", "columns": ["header"], - "types": ["kotlin.String"], + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }], "ncol": 1, "nrow": 3 } @@ -274,7 +290,13 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "ColumnGroup", "columns": ["col1", "col2"], - "types": ["kotlin.String", "kotlin.Int"] + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }, { + "kind": "ValueColumn", + "type": "kotlin.Int" + }] } }, "col3": "Bar", @@ -289,7 +311,10 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "FrameColumn", "columns": ["header"], - "types": ["kotlin.String"], + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }], "ncol": 1, "nrow": 3 } @@ -303,7 +328,13 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "ColumnGroup", "columns": ["col1", "col2"], - "types": ["kotlin.String", "kotlin.Int"] + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }, { + "kind": "ValueColumn", + "type": "kotlin.Int" + }] } }, "col3": null, @@ -318,7 +349,10 @@ class RenderingTests : JupyterReplTestCase() { "metadata": { "kind": "FrameColumn", "columns": ["header"], - "types": ["kotlin.String"], + "types": [{ + "kind": "ValueColumn", + "type": "kotlin.String" + }], "ncol": 1, "nrow": 3 } diff --git a/docs/serialization_format.md b/docs/serialization_format.md index 9270aad2c8..dff4923a2f 100644 --- a/docs/serialization_format.md +++ b/docs/serialization_format.md @@ -1,19 +1,34 @@ ## Serialization 
format for the Kotlin notebooks plugin -This document is an informal specification of the serialization format used for rendering Kotlin dataframes in the Kotlin notebooks plugin of IntelliJ IDEA. +This document is an informal specification of the serialization format used for +rendering Kotlin dataframes in the Kotlin notebooks plugin of IntelliJ IDEA. + +### Version 2.1.0 + +**1.0.0:** + + * ... + +**2.0.0:** + * ... + +**2.1.0:** + * Added a `types` property to dataframe and row metadata. It contains column + information for all groups, frames and values. -### Version 2.0.0 ### Top level json structure ```json { - "$version": "2.0.0", + "$version": "2.1.0", "metadata": { "columns": [ string, ... ], // column names + "types": [ TypeDescriptor, ... ], // type description for each entry in "columns" "nrow": int, "ncol": int }, "kotlin_dataframe": [ Row, ... ] } ``` + ### Row ```json { @@ -27,23 +42,38 @@ This document is an informal specification of the serialization format used for "": NestedFrame } ``` + ### ColumnGroup ```json { "metadata": { - "kind": "ColumnGroup" + "kind": "ColumnGroup", + "columns": [ string, ... ], // column names in the group + "types": [ TypeDescriptor, ... ] // type description for each entry in "columns" }, "data": Row } ``` + ### NestedFrame ```json { "metadata": { "kind": "FrameColumn" + "columns": [ string, ... ], // column names in the frame + "types": [ TypeDescriptor, ... ], // type description for each entry in "columns" "nrow": int, "ncol": int }, "data": [ Row, ... ] } ``` + +### TypeDescriptor +```json +{ + "kind": "ValueColumn"|"ColumnGroup"|"FrameColumn", + "type": FQN + nullability identifier (?), e.g. "kotlin.String?" // Only available if kind == "ValueColumn" +} +``` +